In [119]:
import pandas as pd
import matplotlib.pyplot as plt

In [120]:
#convert to ascii to get rid of the special characters
df = pd.read_excel('data/sample_data.xlsx', encoding="ascii")

In [121]:
df.columns

Index(['customer_id', 'gender', 'age', 'total_score', 'industry', 'title',
       'skills'],
      dtype='object')

In [122]:
df.head()

Unnamed: 0,customer_id,gender,age,total_score,industry,title,skills
0,1,,,3.45,Information Technology and Services,,
1,2,,,3.44,Internet,,
2,3,,0-0,10.49,,,
3,4,Male,0-0,3.46,Construction,"Functional Architect, Business Technology","SharePoint,Management ,Business Intelligence ,..."
4,5,,,3.45,,,


In [123]:
df.describe()

Unnamed: 0,customer_id,total_score
count,5022.0,5022.0
mean,2511.5,5.853035
std,1449.870856,3.336261
min,1.0,0.45
25%,1256.25,3.43
50%,2511.5,3.47
75%,3766.75,10.46
max,5022.0,10.94


In [124]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5022 entries, 0 to 5021
Data columns (total 7 columns):
customer_id    5022 non-null int64
gender         757 non-null object
age            3009 non-null object
total_score    5022 non-null float64
industry       4148 non-null object
title          2593 non-null object
skills         2366 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 274.7+ KB


In [125]:
print(df.shape)

(5022, 7)


In [126]:
#check each variable for missing values
print("Total NA:", sum(df.age.isnull().values.ravel()))
print("NA %:", "{:.1%}".format(sum(df.age.isnull().values.ravel())/sum(df.isnull().values.ravel())))

Total NA: 2013
NA %: 16.5%


In [127]:
print("Total NA:", sum(df.gender.isnull().values.ravel()))
print("NA %:", "{:.1%}".format(sum(df.gender.isnull().values.ravel())/sum(df.isnull().values.ravel())))

Total NA: 4265
NA %: 34.9%


In [128]:
print("Total NA:", sum(df.industry.isnull().values))
print("NA %:", "{:.1%}".format(sum(df.industry.isnull().values.ravel())/sum(df.isnull().values.ravel())))

Total NA: 874
NA %: 7.1%


In [129]:
print("Total NA:", sum(df.title.isnull().values))
print("NA %:", "{:.1%}".format(sum(df.title.isnull().values.ravel())/sum(df.isnull().values.ravel())))

Total NA: 2429
NA %: 19.8%


In [130]:
print("Total NA:", sum(df.skills.isnull().values))
print("NA %:", "{:.1%}".format(sum(df.skills.isnull().values.ravel())/sum(df.isnull().values.ravel())))

Total NA: 2656
NA %: 21.7%


In [131]:
#Cleaning industry

In [140]:
#slice the industry into a dataframe
df_ind = df[['total_score', 'industry']] 
print("Total rows:", df_ind.shape)
print("Total NA:", sum(df_ind.isnull().values))
print("NA %:", "{:.1%}".format(sum(df_ind.isnull().values.ravel())/sum(df_ind.isnull().values.ravel())))
if isinstance(df['industry'], str):
    print("Encoding is ordinary string")
else:
    print("Encoding is not a string")
    
#check the type
print("Encoding is ", type(df['industry']))
df_ind.to_csv('df_ind_initial.csv', sep = ',')

Total rows: (5022, 2)
Total NA: [  0 874]
NA %: 100.0%
Encoding is not a string
Encoding is  <class 'pandas.core.series.Series'>


In [141]:
type(df_ind)

pandas.core.frame.DataFrame

In [142]:
#remove missing values

df_ind = df_ind[pd.notnull(df_ind['industry'])]

print("Total rows:", df_ind.shape)
print("Total NA:", sum(df_ind.isnull().values))
print("NA %:", "{:.1%}".format(sum(df_ind.isnull().values.ravel())/sum(df_ind.isnull().values.ravel())))
df_ind.to_csv('df_ind_no_na.csv', sep = ',')

Total rows: (4148, 2)
Total NA: [0 0]
NA %: nan%


In [143]:
#remove all '_x000D' and '_x000D_' from the beginning of the values

type(df_ind)

df_ind['industry'] = df_ind['industry'].str.replace('_x000D_', '')


df_ind.to_csv('df_ind_no_x000D.csv', sep = ',')

In [144]:
#remove all columns with 'http' from the industry
df_ind = df_ind[~df_ind['industry'].str.contains('http')]

df_ind.to_csv('df_ind_no_http.csv', sep = ',')

In [145]:
#replace '&' with 'AND'
df_ind['industry'] = df_ind['industry'].str.replace("&","AND")
df_ind.to_csv('df_ind_no&.csv', sep = ',')

In [146]:
#convert all to lowercase
df_ind['industry'] = df_ind.industry.str.lower()
df_ind.head()
df_ind.to_csv('df_ind_lower.csv', sep = ',')

In [147]:
#split by comma
df_ind = pd.concat([pd.Series(row['total_score'], row['industry'].split(','))
           for _, row in df_ind.iterrows()]).reset_index()


df_ind.to_csv('df_ind_split_comma.csv', sep = ',')

In [148]:
#add column names
column_names = ["industry",'total_score']

df_ind.columns = column_names
df_ind.to_csv('df_ind_column_names.csv', sep = ',')

In [149]:
#strip spaces from the industry values
df_ind['industry'] = df_ind['industry'].str.strip()

df_ind.to_csv('df_ind_stripped_column.csv', sep = ',')

In [150]:
# remove columns that begin with /
df_ind = df_ind[~df_ind['industry'].str.startswith('/')]

df_ind.to_csv('df_ind_no_slash.csv', sep = ',')

In [151]:
#add a new column and populate dummy values

df_ind['dummies'] = 1


df_ind.to_csv('df_ind_add_cols.csv', sep = ',')

In [152]:
#crosstab to set up for multiple regression

df_ind_cross = pd.crosstab(df_ind.total_score, df_ind.industry)

df_ind_cross.to_csv('df_ind_crosstab.csv', sep = ',')

In [153]:
type(df_ind_cross)

pandas.core.frame.DataFrame

In [154]:
#multiple regression
from sklearn.linear_model import LinearRegression

df_ind_cross

industry,Unnamed: 1_level_0,accounting,accounting and accounting services,acounting,advertising,advertising and marketing,aerospace and defense,airlines,airlines/aviation,airports and air services,...,waste treatment,water and water treatment,website hosting and internet-related services,wellness and fitness,wholesale,wine and spirits,wineries and breweries,wire and cable,wireless,writing and editing
total_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2.10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2.17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2.26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2.31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2.36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2.37,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2.38,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2.39,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2.40,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2.41,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
