In [1]:
# Base libraries
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import BASE_COLORS
%matplotlib inline

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the marketing customer value CSV from the lab folder and preview the first rows.
# NOTE(review): the preview header starts with an 'Unnamed: 0' column — presumably a row
# index saved into the file; confirm whether it should be excluded before modelling.
customer_df = pd.read_csv('files_for_lab/we_fn_use_c_marketing_customer_value_analysis.csv')
customer_df.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,Location Code,Marital Status,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,Suburban,Married,69,32,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,Suburban,Single,94,13,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,Suburban,Married,108,18,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,Suburban,Married,106,18,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,Rural,Single,73,12,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


## EDA

In [None]:
# (rows, columns) of the frame as loaded.
customer_df.shape

In [None]:
# Per-column dtype and non-null count summary.
customer_df.info()

In [None]:
# Summary statistics of the frame (pandas describes the numeric columns by default).
customer_df.describe()

In [None]:
# Normalise the column labels to snake_case: spaces become underscores, then lower-case.
new_cols = list(customer_df.columns.str.replace(" ", "_", regex=False).str.lower())
customer_df.columns = new_cols

In [None]:
# Parse the date column into datetime values. Once it is datetime it is neither object
# nor numeric dtype, so the object/number dtype split later in the notebook leaves it
# out of both subsets.
# NOTE(review): the previewed values look like month/day/two-digit-year; consider an
# explicit format= once that is confirmed against the full file.
customer_df['effective_to_date'] = pd.to_datetime(customer_df['effective_to_date'])

In [None]:
# Fraction of missing values per column (the mean of the boolean NaN mask).
customer_df.isna().mean()

In [None]:
# Preview the frame after the column renaming and the date conversion.
customer_df.head()

# Break into Numerical and Categorical

In [None]:
# Split the columns by dtype: numeric ones for scaling, object (string) ones for encoding.
numerical = customer_df.select_dtypes(include="number")
categorical = customer_df.select_dtypes(include="object")

In [None]:
def plot_num(Dataframe):
    """Show one distribution plot per column of ``Dataframe``.

    Each column is drawn with ``sns.distplot`` and shown on its own figure via
    ``plt.show()``. Returns ``None``.
    """
    for _, series in Dataframe.items():
        # NOTE(review): distplot is deprecated in recent seaborn releases; it is kept
        # here so the rendered output does not change.
        sns.distplot(a=series)
        plt.show()
   

# Draw the distribution of every numerical column.
plot_num(numerical)

## Checking for outliers

In [None]:
def outlier(Dataframe):
    """Show one vertical box plot per column of ``Dataframe``.

    Each column is drawn with ``sns.boxplot`` and shown on its own figure via
    ``plt.show()``. Returns ``None``.
    """
    for _, series in Dataframe.items():
        sns.boxplot(y=series)
        plt.show()
   

# Box plot of every numerical column to eyeball outliers.
outlier(numerical)

### Columns with outliers

* customer_lifetime_value 
* monthly_premium_auto 
* total_claim_amount

In [None]:
# Summary statistics, transposed so each numerical column is one row.
numerical.describe().T

In [None]:
# Names of the numerical columns.
numerical.columns

In [None]:
def value_count(df):
    """Print the number of distinct non-null values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected. The counts are taken from this argument
        itself, not from any notebook-level frame.

    Returns
    -------
    None
        One line per column is written to stdout.
    """
    for column_name in df:
        # value_counts() skips NaN, so its length is the number of distinct non-null values.
        print (column_name,':     ',len(df[column_name].value_counts()))
    return

# Print the distinct-value count of every numerical column.
value_count(numerical)
        
#print(len(numerical.customer_lifetime_value.value_counts()))        

In [None]:
# Distinct-value count of each numerical column, one number per line.
for col_name in (
    "customer_lifetime_value",
    "income",
    "monthly_premium_auto",
    "months_since_last_claim",
    "months_since_policy_inception",
    "number_of_open_complaints",
    "number_of_policies",
    "total_claim_amount",
):
    print(len(numerical[col_name].value_counts()))

In [None]:
# Shape of the numerical frame before any outlier rows are removed.
print('Old Shape', numerical.shape)

In [None]:
def outliers(column, threshold = 3):
    """Return the entries of ``column`` whose absolute z-score exceeds ``threshold``.

    The z-score is ``(x - mean) / std`` with the sample standard deviation (ddof=1),
    which is the square root of ``column.var()``.

    Parameters
    ----------
    column : pandas.Series
        Numeric series to screen.
    threshold : float, default 3
        Entries strictly further than this many standard deviations from the mean
        are returned.

    Returns
    -------
    pandas.Series
        The selected entries with their original index labels. Empty when the
        standard deviation is zero or undefined (constant, single-element or
        empty input).
    """
    # Compute the statistics once instead of once per element.
    center = column.mean()
    spread = column.std()
    if not spread > 0:
        # Zero or NaN spread: no entry can be "far" from the mean.
        return column.iloc[0:0]
    z_scores = (column - center) / spread
    return column[z_scores.abs() > threshold]

In [None]:
# Flag z-score outliers in the three columns whose box plots showed long tails.
CLV_outliers, MPA_outliers, TCA_outliers = (
    outliers(numerical[col_name])
    for col_name in (
        "customer_lifetime_value",
        "monthly_premium_auto",
        "total_claim_amount",
    )
)

In [None]:
# Row labels flagged as an outlier in at least one of the three columns.
# Index.union() is used explicitly: in pandas 2.x the `|` operator on Index objects
# is an element-wise logical OR, not a set union.
to_drop = CLV_outliers.index.union(MPA_outliers.index).union(TCA_outliers.index)

In [None]:
# Remove the flagged rows, renumber the index from zero, then check for NaNs.
rows_kept = numerical.drop(index=to_drop)
clean_numerical = rows_kept.reset_index(drop=True)
clean_numerical.isna().sum()

In [None]:
# Drop the same outlier rows from the categorical columns so both halves stay aligned.
# The frame is rebuilt from customer_df rather than from `categorical` itself: the
# labels in to_drop refer to the original row index, so re-running a self-referential
# version of this cell would remove different rows from the already renumbered frame.
categorical = (
    customer_df.select_dtypes(include=object)
    .drop(to_drop)
    .reset_index(drop=True)
)

In [None]:
 # Shape after the outlier rows were dropped, to compare with the 'Old Shape' print.
 print("New Shape: ", clean_numerical.shape)

## Looking at Categorical features

In [None]:
# Preview the categorical columns.
categorical.head()

In [None]:
# `customer` is unique per row, so a count plot of it would be meaningless;
# remove it before plotting.
categorical = categorical.drop(columns=['customer'])

In [None]:
def plot_cat(Dataframe):
    """Show one count plot per column of ``Dataframe``.

    Each column is drawn with ``sns.countplot`` and shown on its own figure via
    ``plt.show()``. Returns ``None``.
    """
    for _, series in Dataframe.items():
        sns.countplot(x=series)
        plt.show()
   

# Count plot of every remaining categorical column.
plot_cat(categorical)


## Insights into categorical data
column | decision
-------|---------
state | only 5 - leave as is
response | imbalanced make 1,0
coverage| ordinal encode
education | combine master & dr.
employment status | smallest 3 into other
gender | 1hot encode
location | ordinal encode as is
marital | 1hot encode as is
policy type | combine corporate/special
policy | drop column
renew offer | encode as is
sales channel | 1hot encode as is
vehicle class | 1hot combine luxury/sports
vehicle size | ordinal encode

## Finding discrete and continuous columns

In [None]:
#discrete = [i for i in numerical if (numerical[i].all() == numerical[i].apply(int).all()) \
                 #& (len(numerical[i].unique()) < (numerical.shape[0] * 0.01))]

# continuous = list(numerical.drop(columns = discrete).columns)

In [None]:
#numerical['Total Claim Amount'].apply(lambda x: x.is_integer()).sum()
#len(numerical['Total Claim Amount'])

In [None]:
# def dis_cont(df):
#     discrete = []
#     continuous = []
#     for col in df:
#         if (df[col].all() == df[col].apply(int).all()) & (len(df[col].unique()) < (df.shape[0] * 0.01)):
#             discrete.append(col)
#         else:
#             continuous.append(col)
#     return discrete, continuous
            

In [None]:
# len(numerical['Total Claim Amount'].unique())
# numerical.shape[0]*.01

In [None]:
def discrete_continuous(df, max_unique_ratio=0.01):
    """Split the columns of ``df`` into discrete and continuous by cardinality.

    A column is treated as discrete when its number of unique values (NaN counts
    as a value) is strictly below ``max_unique_ratio`` times the number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.
    max_unique_ratio : float, default 0.01
        Cardinality cut-off expressed as a fraction of the row count.

    Returns
    -------
    tuple[list, list]
        ``(discrete_lst, continuous_lst)`` with column names in their original order.
    """
    cutoff = df.shape[0] * max_unique_ratio
    discrete_lst, continuous_lst = [], []
    for col in df.columns:
        bucket = discrete_lst if len(df[col].unique()) < cutoff else continuous_lst
        bucket.append(col)
    return (discrete_lst, continuous_lst)
   
        

In [None]:
# Classify the outlier-free numerical columns by cardinality.
discrete_lst, continuous_lst = discrete_continuous(clean_numerical)

In [None]:
# Show which columns landed in each group (label typo "Cotinuous" corrected).
print('Discrete List:  ',discrete_lst)
print('Continuous List:   ',continuous_lst)

In [None]:
# Select from the outlier-free frame: discrete_continuous() was run on clean_numerical
# and every later step works on it, so these subsets should share the same rows.
num_discrete_df = clean_numerical[['months_since_last_claim', 'number_of_open_complaints', 'number_of_policies']]
num_continuous_df = clean_numerical[['customer_lifetime_value', 'income', 'monthly_premium_auto', 'months_since_policy_inception', 'total_claim_amount']]

# Checking if the discrete list can all be integers

In [None]:
# def is_integer_num(df):
#     for col in df:
#         for i in col:
#             if isinstance(i, int):
#                 print('int')
#             if isinstance(i, float):
#                 print('float')
#     return   


# is_integer_num(num_discrete_df)

# Checking Correlations

In [None]:
# Pairwise correlation matrix of the outlier-free numerical columns.
correlations = clean_numerical.corr()
correlations

In [None]:
# Annotated heatmap of the same correlation matrix, drawn on an explicit 10x8 axes.
fig, ax = plt.subplots(figsize=(10, 8))
corr_matrix = clean_numerical.corr()
ax = sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

## Working with categoricals

In [None]:
# Low-cardinality categorical columns: fewer unique values than 1% of the rows.
# The row count comes from `categorical` itself; `numerical` still contains the
# outlier rows and therefore has a different length.
max_levels = categorical.shape[0] * 0.01
categorical_columns = [col for col in categorical if len(categorical[col].unique()) < max_levels]

In [None]:
# List the columns that passed the cardinality filter.
print(categorical_columns)

### Check if any columns are different

In [None]:
# Preview the categorical frame to compare against the filtered column list.
categorical.head()

## Dummy code for ordinal encoding
#### data["coverage"] = data["coverage"].map({"Basic" : 0, "Extended" : 1, "Premium" : 2})

In [None]:
# Ordinal-encode coverage onto a 0..1 scale. Series.map turns any value that is not
# a key into NaN, so this cell must run exactly once on the raw labels.
coverage_levels = {"Basic": 0, "Extended": .5, "Premium": 1}
categorical["coverage"] = categorical["coverage"].map(coverage_levels)

In [None]:
# Distinct employment-status labels, checked before writing the mapping for them.
categorical['employmentstatus'].unique()

In [None]:
# Ordinal-encode education into three tiers: Bachelor and College share the middle
# tier, Master and Doctor share the top tier.
education_levels = {
    "High School or Below": 0,
    "Bachelor": .5,
    "College": .5,
    "Master": 1,
    "Doctor": 1,
}
categorical["education"] = categorical["education"].map(education_levels)

In [None]:
# Ordinal-encode employment status; the three non-working statuses collapse to 0.
# NOTE(review): "Employed" maps to 2 while every other ordinal mapping in this
# notebook tops out at 1 — confirm this is intentional.
employment_levels = {
    "Medical Leave": 0,
    "Disabled": 0,
    "Unemployed": 0,
    "Retired": .5,
    "Employed": 2,
}
categorical["employmentstatus"] = categorical["employmentstatus"].map(employment_levels)


In [None]:
# Ordinal-encode the location code from Rural (0) to Urban (1).
location_levels = {"Rural": 0, "Suburban": .5, "Urban": 1}
categorical["location_code"] = categorical["location_code"].map(location_levels)

In [None]:
# Ordinal-encode vehicle size from Small (0) to Large (1).
size_levels = {"Small": 0, "Medsize": .5, "Large": 1}
categorical["vehicle_size"] = categorical["vehicle_size"].map(size_levels)

In [None]:
# Preview the frame after the ordinal mappings.
categorical.head()

In [None]:
# NOTE(review): same preview as the previous cell — looks like a leftover duplicate.
categorical.head()

In [None]:
# Remove the `policy` column, as decided in the categorical insights table.
categorical = categorical.drop(columns=['policy'])

In [None]:
# Put the numerical and categorical halves side by side; both were re-indexed from
# zero after the outlier drop, so the rows line up.
X_all = pd.concat((clean_numerical, categorical), axis="columns")
X_all.head()

### Processing for the X y split
### X=features  y=target

In [None]:
# Separate the target (total claim amount) from the feature columns.
target_name = 'total_claim_amount'
y = X_all[target_name]
X = X_all.drop(columns=target_name)

In [None]:
# Missing values per feature; a label missed by one of the ordinal mappings would show up here as NaN.
X.isna().sum()

## Train/test/split data
### Splitting / Scaling / 1Hot encoding X_train and X_test

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
print('X_train shape is:',X_train.shape)
print('y_train shape is:', y_train.shape)
print('X_test shape is:', X_test.shape)
# This label previously read 'y_train' although the value printed is y_test.
print('y_test shape is:', y_test.shape)

## Break into numerical and categorical

In [None]:
# Split each partition by dtype: numeric columns go to the scaler, object columns to
# the one-hot encoder.
X_train_num = X_train.select_dtypes(include="number")
X_test_num = X_test.select_dtypes(include="number")
X_train_cat = X_train.select_dtypes(include="object")
X_test_cat = X_test.select_dtypes(include="object")

In [None]:
# Preview the numeric training features before scaling.
X_train_num.head()

### Initialize and fit the scaler

In [None]:
# Fit the min-max scaler on the training features only; the same fitted scaler is
# reused for the test set so no test statistics are used for scaling.
transformer = MinMaxScaler().fit(X_train_num)

In [None]:
# Scale the training features and wrap the array in a frame with the original column
# names. The frame gets a fresh 0..n-1 index, not the shuffled index of X_train_num.
X_norm1 = transformer.transform(X_train_num)
X_train_scale = pd.DataFrame(data=X_norm1, columns=X_train_num.columns)
X_train_scale.head()

In [None]:
# Scale the test features with the scaler fitted on the training data; the resulting
# frame gets a fresh 0..n-1 index.
X_norm2 = transformer.transform(X_test_num)
X_test_scale = pd.DataFrame(data=X_norm2, columns=X_test_num.columns)
X_test_scale.head()

In [None]:
# One row is enough to see which columns are left for one-hot encoding.
X_train_cat.head(1)

## 1-hot encoding the train and test

In [None]:
# Fit the one-hot encoder on the training categories only; drop='first' removes one
# dummy column per feature.
encoder = OneHotEncoder(drop='first').fit(X_train_cat)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back to the old name on versions that predate get_feature_names_out().
if hasattr(encoder, "get_feature_names_out"):
    cols = encoder.get_feature_names_out(X_train_cat.columns)
else:
    cols = encoder.get_feature_names(input_features=X_train_cat.columns)

# transform() returns a sparse matrix; densify it to build a labelled frame.
X_train_cat_encode = pd.DataFrame(encoder.transform(X_train_cat).toarray(),columns=cols)

X_train_cat_encode.head()


In [None]:
# Encode the test categories with the encoder that was fitted on the training data.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back to the old name on versions that predate get_feature_names_out().
if hasattr(encoder, "get_feature_names_out"):
    cols = encoder.get_feature_names_out(X_test_cat.columns)
else:
    cols = encoder.get_feature_names(input_features=X_test_cat.columns)

# transform() returns a sparse matrix; densify it to build a labelled frame.
X_test_cat_encode = pd.DataFrame(encoder.transform(X_test_cat).toarray(),columns=cols)

X_test_cat_encode.head()


## Concatenating the scaled numericals and the encoded categories

In [None]:
# Join the encoded categoricals and the scaled numericals column-wise. Both halves
# carry a fresh 0..n-1 index, so the rows pair up by position. X_train and X_test are
# overwritten with the fully processed feature matrices.
X_train = pd.concat((X_train_cat_encode, X_train_scale), axis="columns")
X_test = pd.concat((X_test_cat_encode, X_test_scale), axis="columns")

In [None]:
# Shapes after preprocessing; row counts of features and targets must still match.
print('X_train shape is:',X_train.shape)
print('y_train shape is:', y_train.shape)
print('X_test shape is:', X_test.shape)
# This label previously read 'y_train' although the value printed is y_test.
print('y_test shape is:', y_test.shape)

##  Models

In [None]:
# Define function to run all models
def models_automation(models, X_train, y_train, X_test=None, y_test=None):
    """Fit each model and print its train and test score.

    Parameters
    ----------
    models : iterable
        Estimators exposing ``fit(X, y)`` and ``score(X, y)``.
    X_train, y_train
        Data every model is fitted on and scored against for the "Train" figure.
    X_test, y_test : optional
        Hold-out data for the "Test" figure. When omitted, the notebook-level
        ``X_test`` / ``y_test`` variables are used, so existing three-argument
        calls keep working.

    Raises
    ------
    NameError
        If the hold-out data is neither passed in nor defined at notebook level.
    """
    if X_test is None or y_test is None:
        # Backward compatibility: fall back to the variables from the split cell.
        notebook_vars = globals()
        try:
            X_test = notebook_vars["X_test"] if X_test is None else X_test
            y_test = notebook_vars["y_test"] if y_test is None else y_test
        except KeyError as missing:
            raise NameError(f"name {missing.args[0]!r} is not defined") from None
    for model in models:
        model.fit(X_train, y_train)
        print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")


In [None]:
# Candidate regressors. Every estimator with stochastic behaviour gets a fixed seed
# (the same one as the train/test split) so the printed scores are reproducible.
model_list = [
    LinearRegression(),
    SGDRegressor(random_state=13),
    KNeighborsRegressor(),
    MLPRegressor(random_state=13),
    DecisionTreeRegressor(random_state=13),
    RandomForestRegressor(random_state=13),
]
models_automation(model_list, X_train, y_train)



### It looks like the Random Forest Regressor had the best results

In [None]:
# from sklearn import model_selection
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier

In [None]:
# prepare models
# models = []
# models.append(('LR', LogisticRegression()))
# models.append(('KNN', KNeighborsClassifier()))
# models.append(('CART', DecisionTreeClassifier()))
# models.append(('FOREST',RandomForestRegressor))
# seed = 7
# # evaluate each model in turn
# results = []
# names = []
# scoring = 'accuracy'
# for name, model in models:
# 	kfold = model_selection.KFold(n_splits=10,)
# 	cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
# 	results.append(cv_results)
# 	names.append(name)
# 	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
# 	print(msg)



In [None]:
#https://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/

In [None]:
# # boxplot algorithm comparison
# fig = plt.figure()
# fig.suptitle('Algorithm Comparison')
# ax = fig.add_subplot(111)
# plt.boxplot(results)
# ax.set_xticklabels(names)
# plt.show()


In [None]:
# LR = LinearRegression()
# LR.fit(X_train, y_train)
# print('training set score:{:3f}'.format(LR.score(X_train,y_train)))
# print('test set score:{:3f}'.format(LR.score(X_test,y_test)))

In [None]:
#from sklearn.metrics import confusion_matrix

In [None]:
#prediction = LinearRegression.predict(X_test)

In [None]:
#confusion_matrix(y_test, prediction)