In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#convert to ascii to get rid of the special characters
df = pd.read_excel('data/sample_data.xlsx', encoding="ascii")

In [3]:
df.columns

Index(['customer_id', 'gender', 'age', 'total_score', 'industry', 'title',
       'skills'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,customer_id,gender,age,total_score,industry,title,skills
0,1,,,3.45,Information Technology and Services,,
1,2,,,3.44,Internet,,
2,3,,0-0,10.49,,,
3,4,Male,0-0,3.46,Construction,"Functional Architect, Business Technology","SharePoint,Management ,Business Intelligence ,..."
4,5,,,3.45,,,


In [5]:
df.describe()

Unnamed: 0,customer_id,total_score
count,5022.0,5022.0
mean,2511.5,5.853035
std,1449.870856,3.336261
min,1.0,0.45
25%,1256.25,3.43
50%,2511.5,3.47
75%,3766.75,10.46
max,5022.0,10.94


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5022 entries, 0 to 5021
Data columns (total 7 columns):
customer_id    5022 non-null int64
gender         757 non-null object
age            3009 non-null object
total_score    5022 non-null float64
industry       4148 non-null object
title          2593 non-null object
skills         2366 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 274.7+ KB


In [7]:
print(df.shape)

(5022, 7)


In [8]:
#check each variable for missing values
print("Total NA:", sum(df.age.isnull().values.ravel()))
print("NA %:", "{:.1%}".format(sum(df.age.isnull().values.ravel())/sum(df.isnull().values.ravel())))

Total NA: 2013
NA %: 16.5%


In [9]:
print("Total NA:", sum(df.gender.isnull().values.ravel()))
print("NA %:", "{:.1%}".format(sum(df.gender.isnull().values.ravel())/sum(df.isnull().values.ravel())))

Total NA: 4265
NA %: 34.9%


In [10]:
print("Total NA:", sum(df.industry.isnull().values))
print("NA %:", "{:.1%}".format(sum(df.industry.isnull().values.ravel())/sum(df.isnull().values.ravel())))

Total NA: 874
NA %: 7.1%


In [11]:
print("Total NA:", sum(df.title.isnull().values))
print("NA %:", "{:.1%}".format(sum(df.title.isnull().values.ravel())/sum(df.isnull().values.ravel())))

Total NA: 2429
NA %: 19.8%


In [12]:
print("Total NA:", sum(df.skills.isnull().values))
print("NA %:", "{:.1%}".format(sum(df.skills.isnull().values.ravel())/sum(df.isnull().values.ravel())))

Total NA: 2656
NA %: 21.7%


In [13]:
#Cleaning industry

In [14]:
#slice the industry into a dataframe
df_ind = df[['customer_id','industry']] 
print("Total rows:", df_ind.shape)
print("Total NA:", sum(df_ind.isnull().values))
print("NA %:", "{:.1%}".format(sum(df_ind.isnull().values.ravel())/sum(df_ind.isnull().values.ravel())))
if isinstance(df['industry'], str):
    print("Encoding is ordinary string")
else:
    print("Encoding is not a string")
    
#check the type
print("Encoding is ", type(df['industry']))
df_ind.to_csv('df_ind_initial.csv', sep = ',')

Total rows: (5022, 2)
Total NA: [  0 874]
NA %: 100.0%
Encoding is not a string
Encoding is  <class 'pandas.core.series.Series'>


In [15]:
type(df_ind)

pandas.core.frame.DataFrame

In [16]:
#remove missing values

df_ind = df_ind[pd.notnull(df_ind['industry'])]

print("Total rows:", df_ind.shape)
print("Total NA:", sum(df_ind.isnull().values))
print("NA %:", "{:.1%}".format(sum(df_ind.isnull().values.ravel())/sum(df_ind.isnull().values.ravel())))
df_ind.to_csv('df_ind_no_na.csv', sep = ',')

Total rows: (4148, 2)
Total NA: [0 0]
NA %: nan%


In [17]:
#remove all '_x000D' and '_x000D_' from the beginning of the values

type(df_ind)

df_ind['industry'] = df_ind['industry'].str.replace('_x000D_', '')


df_ind.to_csv('df_ind_no_x000D.csv', sep = ',')

In [18]:
#remove all columns with 'http' from the industry
df_ind = df_ind[~df_ind['industry'].str.contains('http')]

df_ind.to_csv('df_ind_no_http.csv', sep = ',')

In [19]:
#replace '&' with 'AND'
df_ind['industry'] = df_ind['industry'].str.replace("&","AND")
df_ind.to_csv('df_ind_no&.csv', sep = ',')

In [20]:
#convert all to lowercase
df_ind['industry'] = df_ind.industry.str.lower()
df_ind.head()
df_ind.to_csv('df_ind_lower.csv', sep = ',')

In [21]:
#split by comma
df_ind = pd.concat([pd.Series(row['customer_id'], row['industry'].split(','))
           for _, row in df_ind.iterrows()]).reset_index()


df_ind.to_csv('df_ind_split_comma.csv', sep = ',')

In [50]:
#add column names
column_names = ["industry",'customer_id']

df_ind.columns = column_names
df_ind.to_csv('df_ind_column_names.csv', sep = ',')

In [71]:
#strip spaces from the industry values
df_ind['industry'] = df_ind['industry'].str.strip()

df_ind.to_csv('df_ind_stripped_column.csv', sep = ',')

In [403]:
#add a new column and populate dummy values

df_ind['dummies'] = 1


df_ind.to_csv('df_ind_add_cols.csv', sep = ',')

In [404]:
#crosstab to set up for PCA

df_ind_cross = pd.crosstab(df_ind.customer_id, df_ind.industry)

df_ind_cross.to_csv('df_ind_crosstab.csv', sep = ',')

In [405]:
type(df_ind_cross)

pandas.core.frame.DataFrame