In [62]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score as acc, confusion_matrix as conf
from sklearn.model_selection import train_test_split as holdout

In [343]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as holdout
from sklearn.metrics import accuracy_score as acc, confusion_matrix as conf
import numpy as np

class MARK(object):    
    def __init__(self):
        self.model = LogisticRegression()
        self.data_name = '<<No data yet. Please import data using [package].read_csv("file_name.csv")>>'
        self.X_shape = (0,0)
        self.Y_shape = (0,0)
        self.Y_name = '""'
        self.holdout = 20
        self.X = None
        self.objects = None
        self.numerics = None
        self.Y = None
        self.mapping = {}
        self.columns = None
        self.classification = 0
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
        self.model_name = str(self.model).split('(')[0]
        self.clean_Y = None
        
    def __repr__(self):
        text = '''Easy Data Science Pipeline for UNSW MARK
---------------------------------------------
Holdout Size = {} <<To change validation split, [package].holdout = 20>>\nModel = {}\n
Data Description
---------------------------------------------
Data Source = {}
\nX = {} rows and {} columns
Y ({}) = {} rows and {} unique values {}
\nX columns = {}...{}'''.format(self.holdout,
                        self.model_name,
                        self.data_name,
                        self.X_shape[0], self.X_shape[1],
                        self.Y_name, self.Y_shape[0], self.Y_shape[1],
                        list(self.mapping.values()),
                        self.columns[0:5], self.columns[-5:])
        return text
        
        
        
    def read_csv(self,data):
        try:
            self.data_name = data

            if '.csv' not in data:
                data += '.csv'

            self.X = pd.read_csv(data)
            self.X_shape = self.X.shape
            self.numerics = self.X[(self.X.dtypes != 'O').index[self.X.dtypes != 'O']]
            self.objects = self.X[(self.X.dtypes == 'O').index[self.X.dtypes == 'O']]
            self.columns = list(self.X.columns)
            
            print('''Data Source = {}
\nX = {} rows and {} columns'''.format(self.data_name,self.X_shape[0], self.X_shape[1]))
        except:
            print('''Data failed to be read in\n\nYou either got the file name wrong or the directory is wrong
            Maybe try ["./file_name.csv"]''')
            
            
            
    def summary(self):
        
        data = pd.concat([self.X.nunique(), len(self.X) - self.X.count(), self.X.mode().iloc[0],
          self.numerics.min(), self.numerics.mean(), self.numerics.median(), self.numerics.max()], 1)
        
        data.columns = ['No Uniques','Missing','Mode','Min','Mean','Median','Max']
        data = data.fillna('')
        data = data.sort_values(by = ['Missing','No Uniques'], ascending = False)
        return data
    
    
    def classify(self, target):
        try:
            self.Y_name = "{}".format(target)
            self.Y_shape = (len(self.X[target]), self.X[target].nunique())
            self.Y = self.X.pop(target)
            self.X_shape = self.X.shape
            self.numerics = self.X[(self.X.dtypes != 'O').index[self.X.dtypes != 'O']]
            self.objects = self.X[(self.X.dtypes == 'O').index[self.X.dtypes == 'O']]
            self.columns = list(self.X.columns)

            self.Y = self.Y.astype('category')
            self.clean_Y = self.Y.cat.codes
            self.mapping = dict(enumerate(self.Y.cat.categories))

            print('\nTarget = {}\n{} rows and {} unique values {}'.format(self.Y_name,self.Y_shape[0], 
                                                                          self.Y_shape[1],
                                                                         list(self.mapping.values())))
            self.classification = 1
            
        except:
            print('Target failed to be extracted. Maybe you got the target column name wrong.')
    
    
    
    def clean(self):
        
        nuniques = self.X.apply(lambda x: x.nunique()/len(x))
        cols = self.X.dtypes.loc[(self.X.dtypes == 'object') & (nuniques < 0.5)].index
        others = self.X.dtypes.loc[(self.X.dtypes != 'object') & (nuniques < 0.5)].index

        self.X = pd.concat([self.X[others], pd.get_dummies(self.X[cols], prefix_sep = ' = ')], 1)
        
        self.X_shape = self.X.shape
        self.numerics = self.X[(self.X.dtypes != 'O').index[self.X.dtypes != 'O']]
        self.objects = self.X[(self.X.dtypes == 'O').index[self.X.dtypes == 'O']]
        
        print('\nCleaned successfully\nX = {} rows and {} columns'''.format(
                    self.X_shape[0], self.X_shape[1]))
        
        
        
    def train(self):
        if self.classification == 1:
            
            self.x_train, self.x_test, self.y_train, self.y_test = \
                        holdout(self.X, self.clean_Y, test_size = self.holdout)
            
            self.model.fit(self.x_train, self.y_train)
            print('\n{} trained successfully'.format(self.model_name))
            print('''Classification Report
---------------------------------------------
Training accuracy = {}
Testing accuracy = {}'''.format(round(acc(self.y_train, self.model.predict(self.x_train)),3),
                 round(acc(self.y_test, self.model.predict(self.x_test)),3)))
            
    
    def predict(self, new_X):
        predictions = pd.Series(self.model.predict(new_X))
        predictions = predictions.replace(self.mapping)
        return predictions
    
    
    def impact(self):
        impacts = pd.DataFrame(self.model.coef_)
        impacts.columns = self.x_train.columns
        impacts = impacts.T
        impacts.columns = ['Impact']
        impacts['Absolute Impact'] = np.abs(impacts['Impact'])
        
        impacts.sort_values(by = 'Absolute Impact', ascending = False, inplace = True)
        
        unqs = [self.x_train[x].unique() for x in self.x_train.columns]
        unqs = [list(x[0:10])+['...'] if len(x) > 10 else x for x in unqs]
        
        impacts['Description'] = unqs
        return impacts

In [344]:
mark = MARK()
mark.holdout = 50
mark.read_csv('WA_Fn-UseC_-Telco-Customer-Churn')
mark.classify('Churn')
mark.clean()
mark.train()
mark.predict(mark.x_train)
mark.impact()

Data Source = WA_Fn-UseC_-Telco-Customer-Churn

X = 7043 rows and 21 columns

Target = Churn
7043 rows and 2 unique values ['No', 'Yes']

Cleaned successfully
X = 7043 rows and 44 columns

LogisticRegression trained successfully
Classification Report
---------------------------------------------
Training accuracy = 0.804
Testing accuracy = 0.8


Unnamed: 0,Impact,Absolute Impact,Description
Contract = Two year,-0.704804,0.704804,"[0, 1]"
Contract = Month-to-month,0.650931,0.650931,"[20, 22, 40, 31, 68, 66, 61, 37, 70, 6, ...]"
InternetService = Fiber optic,0.625406,0.625406,"[68.7, 40.05, 42.35, 83.85, 88.15, 61.35, 79.4..."
InternetService = DSL,-0.512948,0.512948,"[0, 1]"
StreamingMovies = Yes,0.243664,0.243664,"[1, 0]"
PaymentMethod = Electronic check,0.242916,0.242916,"[1, 0]"
StreamingTV = Yes,0.237766,0.237766,"[0, 1]"
SeniorCitizen,0.219354,0.219354,"[1, 0]"
OnlineSecurity = No,0.215327,0.215327,"[0, 1]"
MultipleLines = No,-0.208481,0.208481,"[0, 1]"


In [366]:
from itertools import combinations, chain

In [371]:
i = 5
for col in (chain.from_iterable([list(combinations(list(mark.impacts.index[0:i]), x)) for x in range(1,i+1) ])):
    mark.x_train[list(col)]

In [372]:
mark.x_train[list(col)]

Unnamed: 0,Contract = Two year,Contract = Month-to-month,InternetService = Fiber optic,InternetService = DSL,StreamingMovies = Yes
5925,0,0,0,1,0
6235,0,1,0,1,1
2127,0,0,0,1,0
3464,0,1,1,0,1
337,1,0,0,1,1
5117,1,0,0,1,1
3072,0,0,0,1,1
4174,0,1,0,1,0
6528,1,0,0,0,0
508,0,1,1,0,0


In [59]:
data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
y = data.pop('Churn')
mapping = {}

if y.dtype == 'O':
    y = y.astype('category')
    mapping = dict(enumerate(y.cat.categories))
    y = y.cat.codes
    
nuniques = data.apply(lambda x: x.nunique()/len(x))
cols = data.dtypes.loc[(data.dtypes == 'object') & (nuniques < 0.5)].index
others = data.dtypes.loc[(data.dtypes != 'object') & (nuniques < 0.5)].index

data = pd.concat([data[others], pd.get_dummies(data[cols], prefix_sep = ' = ')], 1)
x_train, x_test, y_train, y_test = holdout(data, y, test_size = 0.2)

In [63]:
model = LogisticRegression()
model.fit(x_train, y_train)
prediction = model.predict(x_test)
acc(y_test, prediction)

0.8019872249822569

In [67]:
pd.Series(prediction).replace(mapping)

0       Yes
1        No
2        No
3        No
4        No
5       Yes
6        No
7        No
8        No
9        No
10       No
11       No
12       No
13      Yes
14       No
15       No
16       No
17       No
18      Yes
19       No
20      Yes
21       No
22      Yes
23       No
24       No
25      Yes
26       No
27      Yes
28       No
29      Yes
       ... 
1379    Yes
1380     No
1381     No
1382     No
1383     No
1384    Yes
1385     No
1386     No
1387    Yes
1388     No
1389     No
1390     No
1391     No
1392     No
1393     No
1394    Yes
1395     No
1396     No
1397     No
1398     No
1399     No
1400     No
1401     No
1402     No
1403     No
1404    Yes
1405     No
1406    Yes
1407     No
1408    Yes
Length: 1409, dtype: object