In [1]:
import numpy as np
import pandas as pd

# Working with Multiple Dataframes and Transforming

In [2]:
# First sample dataframe: four rows of two integers each.
# No column names are passed, so pandas labels the columns with the integers 0 and 1
# (i.e., it uses the column position as the name).
data1 = [
    [500, 24],
    [1000, 54],
    [1500, 56],
    [2000, 45],
]
df_1_sample = pd.DataFrame(data1)
df_1_sample

Unnamed: 0,0,1
0,500,24
1,1000,54
2,1500,56
3,2000,45


In [3]:
# Second sample dataframe: same shape as the first one, with different values in the second column.
data2 = [
    [500, 23],
    [1000, 34],
    [1500, 54],
    [2000, 32],
]
df_2_sample = pd.DataFrame(data=data2)
df_2_sample

Unnamed: 0,0,1
0,500,23
1,1000,34
2,1500,54
3,2000,32


In [4]:
# I can collect the dataframes in a plain Python list so they can be accessed by position.
# The list stores references to the same dataframe objects, not copies.
df_list = [df_1_sample, df_2_sample]
df_list

[      0   1
 0   500  24
 1  1000  54
 2  1500  56
 3  2000  45,
       0   1
 0   500  23
 1  1000  34
 2  1500  54
 3  2000  32]

In [5]:
# I can index into the list to get an individual dataframe back (position 0 holds df_1_sample).
df_list[0]

Unnamed: 0,0,1
0,500,24
1,1000,54
2,1500,56
3,2000,45


In [6]:
# Derived text feature: the prefix 'test' followed by the first two characters
# of column 0 after its values are converted to strings.
df_1_sample['col'] = 'test' + df_1_sample[0].astype(str).str.slice(stop=2)
df_1_sample

Unnamed: 0,0,1,col
0,500,24,test50
1,1000,54,test10
2,1500,56,test15
3,2000,45,test20


In [7]:
# I can create another derived feature by combining data from two numerical columns.
# Note that the operands come from two different dataframes: column 0 of df_1_sample
# is multiplied by column 1 of df_2_sample (pandas pairs the rows by index label).
df_1_sample['col2'] = df_1_sample[0]*df_2_sample[1]
df_1_sample

Unnamed: 0,0,1,col,col2
0,500,24,test50,11500
1,1000,54,test10,34000
2,1500,56,test15,81000
3,2000,45,test20,64000


In [8]:
# Derived feature built row by row with a for loop.
# "r" stands for row; use naming that makes sense for your own variables.
for r in range(df_1_sample.shape[0]):
    # .at reads/writes a single cell; .iat, .loc or .iloc would also work with small changes.
    # The newly created column ends up as float even though both inputs are integers.
    df_1_sample.at[r, 'col3'] = df_1_sample.at[r, 0] + df_1_sample.at[r, 1]
df_1_sample['col3'] = df_1_sample['col3'].astype(int)  # Convert the column to integer.
df_1_sample

Unnamed: 0,0,1,col,col2,col3
0,500,24,test50,11500,524
1,1000,54,test10,34000,1054
2,1500,56,test15,81000,1556
3,2000,45,test20,64000,2045


In [9]:
# Same derived feature, this time built with a while loop.
# Unlike the for loop, a while loop needs its own starting value and an explicit counter update.
r = 0  # Start at the first row.
while r < len(df_1_sample):  # "r" stands for row; use naming that makes sense for your variables.
    # .iat, .loc or .iloc could be used instead of .at with small changes.
    # The newly created column ends up as float even though both inputs are integers.
    df_1_sample.at[r, 'col4'] = df_1_sample.at[r, 0] + df_1_sample.at[r, 1]
    r += 1  # Advance the counter so the loop terminates.
df_1_sample['col4'] = df_1_sample['col4'].astype(float)  # Store the column as float.
df_1_sample

Unnamed: 0,0,1,col,col2,col3,col4
0,500,24,test50,11500,524,524.0
1,1000,54,test10,34000,1054,1054.0
2,1500,56,test15,81000,1556,1556.0
3,2000,45,test20,64000,2045,2045.0


In [10]:
# Summary statistics for the numerical columns.
# Recall that the output is itself a dataframe, so it can be filtered like any other.
df_1_sample.describe()

Unnamed: 0,0,1,col2,col3,col4
count,4.0,4.0,4.0,4.0,4.0
mean,1250.0,44.75,47625.0,1294.75,1294.75
std,645.497224,14.637281,30944.506782,653.998662,653.998662
min,500.0,24.0,11500.0,524.0,524.0
25%,875.0,39.75,28375.0,921.5,921.5
50%,1250.0,49.5,49000.0,1305.0,1305.0
75%,1625.0,54.5,68250.0,1678.25,1678.25
max,2000.0,56.0,81000.0,2045.0,2045.0


In [11]:
# The include options are: None (the default, numerical columns only), 'all', or specific dtypes such as 'object'.
# In my experience it is cleaner to obtain .describe() measures separately for numerical and non-numerical columns.
df_1_sample.describe(include = 'object')

Unnamed: 0,col
count,4
unique,4
top,test50
freq,1


In [12]:
# Prints a summary of the dataframe: index range, each column's non-null count and dtype, and memory usage.
df_1_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       4 non-null      int64  
 1   1       4 non-null      int64  
 2   col     4 non-null      object 
 3   col2    4 non-null      int64  
 4   col3    4 non-null      int32  
 5   col4    4 non-null      float64
dtypes: float64(1), int32(1), int64(3), object(1)
memory usage: 304.0+ bytes


In [13]:
# Display the dataframe again to review all of its columns.
df_1_sample

Unnamed: 0,0,1,col,col2,col3,col4
0,500,24,test50,11500,524,524.0
1,1000,54,test10,34000,1054,1054.0
2,1500,56,test15,81000,1556,1556.0
3,2000,45,test20,64000,2045,2045.0


In [14]:
# I can drop columns and reorder them in one step: select only the wanted columns,
# in the wanted order, and assign the result back to the same variable.
# Dropping could also be done with df.drop(columns=['col_to_drop']).
df_1_sample = df_1_sample.loc[:, ['col', 1]]
df_1_sample

Unnamed: 0,col,1
0,test50,24
1,test10,54
2,test15,56
3,test20,45


In [15]:
# I can extract the column values as an array.
# Note: the pandas documentation recommends .to_numpy() over .values.
col_one_arr = df_1_sample['col'].values
print(col_one_arr) # print() shows the plain-text form of the array.
col_one_arr # The bare expression shows the repr, including the dtype.

['test50' 'test10' 'test15' 'test20']


array(['test50', 'test10', 'test15', 'test20'], dtype=object)

In [16]:
# I can extract the column values to a Python list.
col_one_list = df_1_sample['col'].tolist()
print(col_one_list)
# I can extract the column values to a NumPy array.
col_one_arr2 = df_1_sample['col'].to_numpy()
print(col_one_arr2)

['test50', 'test10', 'test15', 'test20']
['test50' 'test10' 'test15' 'test20']


In [17]:
# Checking the type of the object returned by .to_numpy().
type(col_one_arr2)

numpy.ndarray

In [18]:
# Build a new dataframe from a dict that maps each column label to its list of values.
# The labels used here are the string 'col' and the integer 1.
new_data_frame = pd.DataFrame({'col': ['Hello', 'World'], 1: [1, 3]})
new_data_frame

Unnamed: 0,col,1
0,Hello,1
1,World,3


In [19]:
# I can add new rows by concatenating two dataframes with the same column names.
# ignore_index = True renumbers the rows of the result starting from 0.
# Because the result is assigned back to df_1_sample, re-running this cell prepends the rows again.
df_1_sample = pd.concat([new_data_frame, df_1_sample], ignore_index = True)
df_1_sample

Unnamed: 0,col,1
0,Hello,1
1,World,3
2,test50,24
3,test10,54
4,test15,56
5,test20,45


In [20]:
# Transposing the dataframe: rows become columns and columns become rows.
df_1_sample.T

Unnamed: 0,0,1,2,3,4,5
col,Hello,World,test50,test10,test15,test20
1,1,3,24,54,56,45


In [21]:
# I can reset the index of the transposed dataframe.
# drop = True discards the old row labels instead of keeping them as a new column.
df_1_sample = df_1_sample.T.reset_index(drop = True)
df_1_sample

Unnamed: 0,0,1,2,3,4,5
0,Hello,World,test50,test10,test15,test20
1,1,3,24,54,56,45


In [22]:
# I can rename columns using data from a row. In this case, the first row.
# The row is a Series keyed by the current column labels, so it acts as an old-label -> new-label mapping.
# The result is assigned back instead of using inplace = True, which gives the same frame without mutating in place.
df_1_sample = df_1_sample.rename(columns=df_1_sample.iloc[0])
df_1_sample

Unnamed: 0,Hello,World,test50,test10,test15,test20
0,Hello,World,test50,test10,test15,test20
1,1,3,24,54,56,45


In [23]:
# I can drop a row by selecting the other rows (i.e., excluding the first row),
# then reset the index so the remaining rows are numbered from 0 again.
# The result overwrites df_1_sample, so re-running this cell removes another row.
df_1_sample = df_1_sample.iloc[1:].reset_index(drop = True)
df_1_sample

Unnamed: 0,Hello,World,test50,test10,test15,test20
0,1,3,24,54,56,45


# Working with Column Names in Dataframes

In [24]:
# Empty dataframe (no rows) whose columns are two fixed names plus generated names VHS5..VHS7.
data_frame_cols = pd.DataFrame(columns=['Val1', 'Val2'] + [f'VHS{i}' for i in range(5, 8)])
data_frame_cols

Unnamed: 0,Val1,Val2,VHS5,VHS6,VHS7


In [25]:
# Extracting the column names of a dataframe as a NumPy array.
data_frame_cols.columns.values

array(['Val1', 'Val2', 'VHS5', 'VHS6', 'VHS7'], dtype=object)

In [26]:
# Extracting the column names and storing them as feature names.
feature_names = data_frame_cols.columns.values
feature_names

array(['Val1', 'Val2', 'VHS5', 'VHS6', 'VHS7'], dtype=object)

In [27]:
# Defining data values: a plain list mixing strings and integers.
data_values = ['Hello', 'World', 1, 3, 4]
data_values

['Hello', 'World', 1, 3, 4]

In [28]:
# Checking that data_values is a plain Python list.
type(data_values)

list

In [None]:
# Creating a dataframe from a list of values.
# Wrapping data_values in an outer list makes it a single row, labelled with feature_names.
data_frame = pd.DataFrame(data = [data_values], columns = feature_names)
data_frame

# Working with Strings and Lists

In [None]:
# Defining a list of two strings.
df_names = ['df_1_sample33', 'df_2_sample33']
df_names

In [None]:
# Calling the element in position 1 of a list of strings (positions start at 0, so this is the second string).
df_names[1]

In [None]:
# Splitting an element of a list into another list, using "_" as the delimiter.
# 'df_1_sample33' becomes ['df', '1', 'sample33'].
df_names_sections = df_names[0].split("_")
df_names_sections

In [None]:
# Calling an element of a list of strings (position 1).
df_names_sections[1]

In [None]:
# Selecting a partial string within element 2: the characters at positions 2 and 3 (the slice end, 4, is excluded).
df_names_sections[2][2:4]

In [None]:
# Selecting the last two characters of element 2 with a negative slice start.
df_names_sections[2][-2:]

# Notebook End