Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 66 additions & 5 deletions build.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,82 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('./data/conversion_data.csv')
#print df.head(10)




def get_categorical_variables(df):
return []
col_names = df.columns.values
ls = []
for ele in col_names:
#print df[ele].value_counts().index
if len(df[ele].value_counts().index)<10:
#print("{} is a categorical column as it has only {} categories\n".format(ele,len(df[ele].value_counts().index)))
ls.append(ele)
df2 = pd.DataFrame(df,columns=ls)
#print df.info()
return df2



def get_numerical_variables(df):
return []
col_names = df.columns.values
ls = []
for ele in col_names:
if (df[ele].dtype == 'int64' or df[ele].dtype == 'float64') & (ele not in get_categorical_variables(df).columns.values):
ls.append(ele)
df2 = pd.DataFrame(df,columns=ls)
return df2




def get_numerical_variables_percentile(df):
pass
df_1 = get_numerical_variables(df)
dic1 = df_1.describe().values
ls = zip(*dic1)
df_2 = pd.DataFrame(ls,columns=['count','mean','std','min','25th Percentile','50th Percentile','75th Percentile','max'])
ls2 = ['age','total_pages_visited']
df_2['variable name']=ls2
df_2 = df_2[['variable name','mean','25th Percentile','50th Percentile','75th Percentile']]
df_2['median'] = df_2['50th Percentile']
df_2 = df_2[['variable name','mean','median','25th Percentile','50th Percentile','75th Percentile']]
return df_2


def get_categorical_variables_modes(df):
pass
df_3 = get_categorical_variables(df)
dic = {}
for ele in df_3.columns.values:
#print('mode of {} is {}'.format(ele,df_3[ele].mode().values[0]))
if ele not in dic:
dic[ele] = df_3[ele].mode().values[0]
print dic
df_x = pd.DataFrame(dic,index=[0],columns=dic.keys())
return df_x


def get_missing_values_count(df):
pass
cols = df.columns.values
cnt = 0
df_tmp = df
col_ls = []
val_ls = []
for col in cols:
col_ls.append(col)
for e in df[col].values:
if pd.isnull(e) == True:
cnt+=1
val_ls.append(cnt)
cnt = 0
print zip(col_ls,val_ls)
dfz = pd.DataFrame([col_ls,val_ls])
dfz = dfz.T
dfz.rename(columns={0:'var_name',1:'missing_value_count'},inplace=True)
return dfz


def plot_histogram_with_numerical_values(df):
Expand Down
Binary file added build.pyc
Binary file not shown.
Binary file added tests/__init__.pyc
Binary file not shown.
Binary file added tests/test_get_categorical_variables.pyc
Binary file not shown.