# Steps
<b>Objective</b>: predict the benefits that Healthcare for All obtains through mail donations from “lapsed” donors 
<br><b>Gather data</b>: data spread across several files that need to be merged 
<br><b>Clean data</b>: empty cells, weird entries, gender encoding 
<br><b>Explore</b>: answer questions like: group donors (by gender, by state), what’s the distribution by group, are there noticeable group differences? 
<br><b>Process</b>: Look at what model to apply (might need to change how some columns are expressed) 
<br><b>Training</b>: create a model to predict the donation amount of each donor 
<br><b>Validate</b>: check predictions against a little subset of the data 
<br><b>Present</b>: show findings

In [2]:
# Import libraries for the dependencies
import numpy as np # library for numerical analysis, good for arrays
import pandas as pd  # library for data analysis (tabular data) based on numpy
import matplotlib.pyplot as plt # library for plotting 
%matplotlib inline # magic command to display plots in the notebook
import seaborn as sns # library for plotting based on matplotlib

## 1. Gather data

In [6]:
# Load both csv files, one dataframe per file
# (for a tab-separated text file use: pd.read_csv('file2.txt', sep='\t'))
df, df2 = (pd.read_csv(csv_path) for csv_path in ('file1.csv', 'file2.csv'))

In [None]:
# Quick look at the first dataframe
display(df.head())  # first 5 rows (df.tail() shows the last 5)
print(df.shape)  # (number of rows, number of columns)
# df.columns  # column names
# len(df.columns)  # number of columns
df.info()  # summary of the dataframe (columns, dtypes, non-null counts)
# Check that both dataframes have the same columns. Index.equals returns a
# single boolean and also works when the frames have a different number of
# columns, where an element-wise `==` raises ValueError. It is printed because
# a notebook only shows the last expression of a cell automatically.
print(df.columns.equals(df2.columns))
# df.loc[:, ["gender", "id"]]  # select columns by name
# df.iloc[:, [2, 0]]  # select columns by position
df['gender'].value_counts(dropna=False)  # frequency of each value, NaN included

## 2. Clean data

### Standardize columns 

In [None]:
def standardize_columns(frame):
    """Return a copy of `frame` with normalized, alphabetically ordered columns.

    Column names are lower-cased and stripped of surrounding spaces, the
    columns are reordered alphabetically, and the 'id' column is renamed to 'ID'.
    """
    frame = frame.copy()  # work on a copy so the caller's dataframe is not modified
    frame.columns = frame.columns.str.lower().str.strip()
    # frame[['state', 'id']]  # alternative: select or reorder columns by name
    frame = frame.reindex(sorted(frame.columns), axis=1)
    return frame.rename(columns={'id': 'ID'})


# Apply the same clean-up to both dataframes so that their column names still
# match when the two are combined later on
df = standardize_columns(df)
df2 = standardize_columns(df2)

### Merge data 

In [None]:
# Alternative when the files hold different columns about the same donors:
# df3 = pd.merge(df, df2, on='ID')  # join on a key column that exists in both dataframes
# Both files are expected to hold the same columns (they are compared during
# the first look at the data), so the rows are stacked with axis=0 and the
# columns are aligned by name. axis=1 would instead place the dataframes side
# by side and repeat every column name.
df3 = pd.concat([df, df2], axis=0)

### Add new columns, filter rows, reset index

In [None]:
# Give every row a fresh, unique index; the previous index is kept as a column
df3 = df3.reset_index()
# Independent copy, so later transformations cannot destroy the combined data
df3copy = df3.copy(deep=True)
# Other handy operations:
# df3['new column'] = df3['column1'] + df3['column2']  # new column as the sum of two columns
# df3[(df3['target_d'] == 100) & (df3['very_generous'] == True)]  # filter rows on several conditions (AND "&", OR "|", NOT "~", IN .isin)
# df3.T  # transpose the dataframe

### Fix typos, convert to correct data type, deal with empty values

In [None]:
# clean values with typos (remove the "A" and convert to float)
def clean_values(x):
    """Return `x` as a float after removing every "A" character from it.

    The value is converted to text first, so numbers and strings are both
    accepted.
    """
    return float(str(x).replace('A', ''))
# The statements below are alternative ways to clean or convert a column
df3['column'] = df3['column'].apply(clean_values)  # apply the named function to every value
# Same cleaning with an anonymous (lambda) function applied directly to the
# column. str(x) is required because numeric values have no .replace method.
df3['column'] = df3['column'].map(lambda x: float(str(x).replace("A", "")))
df3['column'] = df3['column'].astype(float)  # convert the column to float
df3['column'] = pd.to_numeric(df3['column'], errors='coerce')  # convert to numeric; values that cannot be parsed become NaN
df3['column'] = pd.to_datetime(df3['column'], errors='coerce')  # convert to datetime; values that cannot be parsed become NaT
df3.isna().sum()  # count the number of NaN values in each column

### Remove duplicates

In [None]:
# Clean-up chain: fully duplicated rows first, then rows that repeat the same
# ('column1', 'column2') pair, and finally every row that still holds a NaN
df3 = (
    df3
    .drop_duplicates()
    .drop_duplicates(subset=['column1', 'column2'])
    .dropna()
)


## 3. Explore

### Drop missing values
This approach is acceptable when we don't know what the missing (NA) value could be and dropping it does not remove a lot of data (we don't want to end up with a tiny dataframe!).

In [None]:
# Percentage of NaN values per column (the mean of a boolean mask is the share of True values)
null_pct = df3.isna().mean().mul(100).round(2)
# One row per column of df3: the index (named 'header_name') holds the column
# name and 'percent_nulls' its percentage of nulls. The index is named with
# rename_axis because the names live in the index, not in a column called 'index'.
null_df = null_pct.rename_axis('header_name').to_frame('percent_nulls')
columns_drop = null_df[null_df['percent_nulls'] > 90]  # columns with more than 90% of null values
df3 = df3.drop(columns=columns_drop.index)  # drop those columns from the dataframe

### Replace missing values
Sometimes we can replace the missing values with a "reasonable guess". This guess can come from domain knowledge (we know the source of the data and therefore what the value should be) or from a summary statistic of the column, such as the mean, the median or the mode. 

In [None]:
# Three fill strategies, applied one after another to the same column
# Mean (average) of the column
df3['column'] = df3['column'].fillna(value=df3['column'].mean())
# Median (middle value) of the column
df3['column'] = df3['column'].fillna(value=df3['column'].median())
# Mode (most frequent value); mode() returns a Series, hence the [0]
df3['column'] = df3['column'].fillna(value=df3['column'].mode()[0])

### Statistical check (dispersion)

In [None]:
df3['column'].agg(['min', 'max'])  # smallest and largest value (the range covered by the data)
df3['column'].std()  # standard deviation (how spread out the values are)
df3['column'].var()  # variance (the standard deviation squared)
df3['column'].quantile([0.25, 0.5, 0.75])  # quartiles (25%, 50%, 75%)
df3['column'].describe()  # summary statistics (count, mean, std, min, 25%, 50%, 75%, max)

### Plot the data 

In [None]:
import warnings

# Silence every warning category to keep the cell output clean
warnings.simplefilter('ignore')

In [None]:
# Option 1: histogram through the pandas plotting API
df3['column'].plot(kind='hist', bins=20, figsize=(10, 5), color='red')
plt.title('Histogram of column')
plt.xlabel('column')
plt.ylabel('Frequency')
plt.tight_layout()  # adjust the layout to avoid overlapping
plt.show()

# Option 2: matplotlib object-oriented API
fig, ax = plt.subplots(figsize=(10, 5))  # create a figure and its axes
ax.hist(df3['column'], bins=20, color='red')
plt.show()  # show this figure now so the next plot is not drawn on the same axes

# Option 3: seaborn (histplot replaces distplot, which is deprecated)
plt.figure(figsize=(10, 5))  # fresh figure for the seaborn plot
sns.histplot(df3['column'], bins=20, kde=False, color='red')
plt.show()

# Option 4: interactive plot using plotly
import plotly.express as px
fig = px.histogram(df3, x='column', nbins=20, title='Histogram of column')
fig.show()

### Checking correlations

In [None]:
# Correlation matrix of the numeric columns. Non-numeric columns (e.g. text)
# are excluded first, because recent pandas versions raise an error on them
# instead of silently ignoring them.
df3.select_dtypes(include='number').corr(method='pearson')
# method: 'pearson' (default) measures linear correlation; 'spearman' and
# 'kendall' are rank-based and capture monotonic (not necessarily linear) relationships

In [None]:
from scipy import stats  # statistical functions of scipy

# pearsonr returns the correlation coefficient and the p-value for two columns;
# [0] keeps the coefficient, which is rounded to 2 decimals
round(stats.pearsonr(x=df3['column1'], y=df3['column2'])[0], ndigits=2)