In [2]:
def round2_cleaning(input_data):
    
    # create a copy of the original dataset, to avoid changing the raw data
    output_data = input_data.copy()

    # harmonize feaders
    output_data = output_data.rename(columns={'EmploymentStatus': 'Employment Status'})
    
    # delete useless data
    output_data = output_data.drop(['Unnamed: 0', 'Vehicle Type'], axis=1)
    output_data = output_data.dropna()
    
    # change format of the date and add the extra column with the month
    output_data['Effective To Date'] = pd.to_datetime(output_data['Effective To Date'], errors='coerce')
    output_data['Effective To Month'] = output_data['Effective To Date'].apply(lambda x: x.month)
    
    # filter the data
    output_data = output_data[output_data['Effective To Month'] <=3 ]
    
    # output of the function
    return output_data

In [None]:
# def round7_modeling(input_data):
    
#     # We start by cleaning the data, calling the function created at the end of Round 2
#     cleaned_input = round2_cleaning(input_data)
    
#     # Data processing
#     import pandas as pd
#     import numpy as np
#     # Split numerical from categoricals
#     numericals = cleaned_input._get_numeric_data()
#     categoricals = cleaned_input.select_dtypes(['object']).drop(['Customer'], axis=1)
    
#     # Define target and features
#     y = cleaned_input['Total Claim Amount']
#     X = numericals.drop(['Total Claim Amount'], axis = 1)
    
#     # Standardize the features
#     from sklearn.preprocessing import StandardScaler
#     transformer = StandardScaler().fit(X)
#     X_standardized = transformer.transform(X)
#     X_standardized = pd.DataFrame(X_standardized, columns=X.columns)
    
#     # Standardize the categoricals
#     from sklearn.preprocessing import OneHotEncoder
#     cat_onehot = categoricals[['State', 'Response', 'Employment Status', 'Gender', 'Location Code', 'Marital Status', 'Policy Type', 'Policy', 'Sales Channel', 'Vehicle Class']].copy()
#     encoder = OneHotEncoder().fit(cat_onehot)
#     cols = [colname for row in encoder.categories_ for colname in row]
#     encoded = encoder.transform(cat_onehot).toarray()
#     cat_onehot_encoded = pd.DataFrame(encoded,columns=cols)
#     cols_to_drop = [row[0] for row in encoder.categories_]
#     cat_onehot_encoded = cat_onehot_encoded.drop(cols_to_drop, axis=1)
    
#     cat_label = categoricals[['Coverage', 'Renew Offer Type', 'Education', 'Vehicle Size']].copy()
#     coverage_mapper = { 'Basic':1, 'Extended':2, 'Premium':3 }
#     education_mapper = { 'High School or Below':1, 'College':2, 'Bachelor':3, 'Master':4, 'Doctor':5 }
#     veh_size_mapper = { 'Small':1, 'Medsize':2, 'Large':3 }

#     cat_label['Coverage'] = cat_label['Coverage'].replace(coverage_mapper)
#     cat_label['Education'] = cat_label['Education'].replace(education_mapper)
#     cat_label['Vehicle Size'] = cat_label['Vehicle Size'].replace(veh_size_mapper)
#     cat_label['Renew Offer Type'] = cat_label['Renew Offer Type'].str[-1:]
#     categoricals_encoded = pd.concat([cat_onehot_encoded, cat_label], axis=1)
#     X_complete = pd.concat([X_standardized, categoricals_encoded], axis=1)
    
#     # Linear regression
#     from sklearn.model_selection import train_test_split
#     X_train, X_test, y_train, y_test = train_test_split(X_complete, y, test_size=0.2, random_state=42)
    
#     from sklearn import linear_model
#     lm = linear_model.LinearRegression()
#     lm.fit(X_train,y_train)
    
    
#     from sklearn.metrics import r2_score
#     predictions_train = lm.predict(X_train)
#     R2_train = r2_score(y_train, predictions_train)
#     predictions_test = lm.predict(X_test)
#     R2_test = r2_score(y_test, predictions_test)
#     from sklearn.metrics import mean_squared_error
#     mse_train = np.sqrt(mean_squared_error(y_train,predictions_train))
#     mse_test = np.sqrt(mean_squared_error(y_test,predictions_test))
    
#     import math
#     rmse_train = math.sqrt(mse_train)
#     rmse_test = math.sqrt(mse_test)
    
#     from sklearn.metrics import mean_absolute_error
#     mae_train = mean_absolute_error(y_train, predictions_train)
#     mae_test = mean_absolute_error(y_test, predictions_test)
    
#     validation_matrix = { 'Indicator':['R2','mse', 'rmse', 'mae'],
#                  'Train Set': [R2_train, mse_train, rmse_train, mae_train],
#                  'Test Set': [R2_test, mse_test, rmse_test, mae_test]
#                 } 
    
#     return validation_matrix

In [None]:
import pandas as pd
input_data = pd.read_csv('marketing_customer_analysis.csv')

In [None]:
# output_matrix = round7_modeling(data)
# output_matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# We start by cleaning the data, calling the function created at the end of Round 2
cleaned_input = round2_cleaning(input_data)
    
# Data processing
import pandas as pd
import numpy as np

    # Exclude outliers from the data set for "Customer Lifetime Value" and "Monthly Premium Auto"
iqr_lifetime = np.percentile(cleaned_input['Customer Lifetime Value'],75) - np.percentile(cleaned_input['Customer Lifetime Value'],25)
upper_limit_lifetime = np.percentile(cleaned_input['Customer Lifetime Value'],75) + 1.5*iqr_lifetime
lower_limit_lifetime = np.percentile(cleaned_input['Customer Lifetime Value'],25) - 1.5*iqr_lifetime
cleaned_input = cleaned_input[(cleaned_input['Customer Lifetime Value']>lower_limit_lifetime) & (cleaned_input['Customer Lifetime Value']<upper_limit_lifetime)]

iqr_premium = np.percentile(cleaned_input['Monthly Premium Auto'],75) - np.percentile(cleaned_input['Monthly Premium Auto'],25)
upper_limit_premium = np.percentile(cleaned_input['Monthly Premium Auto'],75) + 1.5*iqr_premium
lower_limit_premium = np.percentile(cleaned_input['Monthly Premium Auto'],25) - 1.5*iqr_premium
cleaned_input = cleaned_input[(cleaned_input['Monthly Premium Auto']>lower_limit_premium) & (cleaned_input['Monthly Premium Auto']<upper_limit_premium)]  

sns.displot(cleaned_input['Customer Lifetime Value'])
plt.show()
sns.displot(cleaned_input['Monthly Premium Auto'])
plt.show()


# Split numerical from categoricals
numericals = cleaned_input._get_numeric_data()
categoricals = cleaned_input.select_dtypes(['object']).drop(['Customer'], axis=1)
    
    
    
    
    # Define target and features
y = cleaned_input['Total Claim Amount']
X = numericals.drop(['Total Claim Amount'], axis = 1)
    
    # Standardize the features
from sklearn.preprocessing import StandardScaler
transformer = StandardScaler().fit(X)
X_standardized = transformer.transform(X)
X_standardized = pd.DataFrame(X_standardized, columns=X.columns)
    
     # Standardize the categoricals
from sklearn.preprocessing import OneHotEncoder
cat_onehot = categoricals[['State', 'Response', 'Employment Status', 'Gender', 'Location Code', 'Marital Status', 'Policy Type', 'Policy', 'Sales Channel', 'Vehicle Class']].copy()
cat_onehot.reset_index(drop=True)
encoder = OneHotEncoder().fit(cat_onehot)
cols = [colname for row in encoder.categories_ for colname in row]
encoded = encoder.transform(cat_onehot).toarray()
cat_onehot_encoded = pd.DataFrame(encoded,columns=cols)
cols_to_drop = [row[0] for row in encoder.categories_]
cat_onehot_encoded = cat_onehot_encoded.drop(cols_to_drop, axis=1)
    
cat_label = categoricals[['Coverage', 'Renew Offer Type', 'Education', 'Vehicle Size']].copy()
coverage_mapper = { 'Basic':1, 'Extended':2, 'Premium':3 }
education_mapper = { 'High School or Below':1, 'College':2, 'Bachelor':3, 'Master':4, 'Doctor':5 }
veh_size_mapper = { 'Small':1, 'Medsize':2, 'Large':3 }

cat_label['Coverage'] = cat_label['Coverage'].replace(coverage_mapper)
cat_label['Education'] = cat_label['Education'].replace(education_mapper)
cat_label['Vehicle Size'] = cat_label['Vehicle Size'].replace(veh_size_mapper)
cat_label['Renew Offer Type'] = cat_label['Renew Offer Type'].str[-1:]
cat_label.reset_index(drop=True, inplace=True)
cat_onehot_encoded.reset_index(drop=True, inplace=True)
categoricals_encoded = pd.concat([cat_onehot_encoded, cat_label], axis=1)
X_complete = pd.concat([X_standardized, categoricals_encoded], axis=1)

    
# Linear regression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_complete, y, test_size=0.2, random_state=42)
    
from sklearn import linear_model
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)
    
    
from sklearn.metrics import r2_score
predictions_train = lm.predict(X_train)
R2_train = r2_score(y_train, predictions_train)
predictions_test = lm.predict(X_test)
R2_test = r2_score(y_test, predictions_test)
from sklearn.metrics import mean_squared_error
mse_train = np.sqrt(mean_squared_error(y_train,predictions_train))
mse_test = np.sqrt(mean_squared_error(y_test,predictions_test))
    
import math
rmse_train = math.sqrt(mse_train)
rmse_test = math.sqrt(mse_test)
    
from sklearn.metrics import mean_absolute_error
mae_train = mean_absolute_error(y_train, predictions_train)
mae_test = mean_absolute_error(y_test, predictions_test)
    
# validation_matrix = { 'Indicator':['R2','mse', 'rmse', 'mae'],
#              'Train Set': [R2_train, mse_train, rmse_train, mae_train],
#              'Test Set': [R2_test, mse_test, rmse_test, mae_test]
#             } 

# validation_matrix = pd.DataFrame(validation_matrix)

validation_matrix = { 'Train Set': [R2_train, mse_train, rmse_train, mae_train],
                 'Test Set': [R2_test, mse_test, rmse_test, mae_test]
                } 
validation_matrix = pd.DataFrame(validation_matrix, index=['R2','mse', 'rmse', 'mae'])



In [None]:
validation_matrix

In [None]:
control_data = pd.read_csv('marketing_customer_analysis.csv')

sns.displot(control_data['Customer Lifetime Value'])
plt.show()
sns.displot(control_data['Monthly Premium Auto'])
plt.show()



In [5]:
import pandas as pd
input_data = pd.read_csv('marketing_customer_analysis.csv')

In [36]:

    # We start by cleaning the data, calling the function created at the end of Round 2
cleaned_input = round2_cleaning(input_data)
    
    # Data processing
import pandas as pd
import numpy as np
    
    # Exclude outliers from the data set for "Customer Lifetime Value" and "Monthly Premium Auto"
numericals = cleaned_input._get_numeric_data()

print(numericals.columns)

for col in numericals.columns:
        print(cleaned_input[col].head())
        print(cleaned_input[col].sum())
        iqr_col = np.percentile(cleaned_input[col], 75) - np.percentile(cleaned_input[col], 25)
        upper_limit = np.percentile(cleaned_input[col], 75) + 1.5 * iqr_col
        lower_limit = np.percentile(cleaned_input[col], 25) - 1.5 * iqr_col
        cleaned_input = cleaned_input[(cleaned_input[col]>lower_limit)&(cleaned_input[col]<upper_limit)]
    
    # iqr_lifetime = np.percentile(cleaned_input['Customer Lifetime Value'],75) - np.percentile(cleaned_input['Customer Lifetime Value'],25)
# upper_limit_lifetime = np.percentile(cleaned_input['Customer Lifetime Value'],75) + 1.5*iqr_lifetime
# lower_limit_lifetime = np.percentile(cleaned_input['Customer Lifetime Value'],25) - 1.5*iqr_lifetime
# cleaned_input = cleaned_input[(cleaned_input['Customer Lifetime Value']>lower_limit_lifetime) & (cleaned_input['Customer Lifetime Value']<upper_limit_lifetime)]

# iqr_premium = np.percentile(cleaned_input['Monthly Premium Auto'],75) - np.percentile(cleaned_input['Monthly Premium Auto'],25)
# upper_limit_premium = np.percentile(cleaned_input['Monthly Premium Auto'],75) + 1.5*iqr_premium
# lower_limit_premium = np.percentile(cleaned_input['Monthly Premium Auto'],75) - 1.5*iqr_premium    
# cleaned_input = cleaned_input[(cleaned_input['Monthly Premium Auto']>lower_limit_premium) & (cleaned_input['Monthly Premium Auto']<upper_limit_premium)]  

cleaned_input = cleaned_input.reset_index(drop=True, inplace=False)
    
    # Split numerical from categoricals
numericals = cleaned_input._get_numeric_data()
categoricals = cleaned_input.select_dtypes(['object']).drop(['Customer'], axis=1)
    
    # Define target and features
y = cleaned_input['Total Claim Amount']
X = numericals.drop(['Total Claim Amount', 'Income', 'Months Since Last Claim', 'Months Since Policy Inception', 'Number of Open Complaints', 'Number of Policies'], axis = 1)
    
    # Standardize the features
from sklearn.preprocessing import StandardScaler
transformer = StandardScaler().fit(X)
X_standardized = transformer.transform(X)
X_standardized = pd.DataFrame(X_standardized, columns=X.columns)
    
    # Standardize the categoricals
from sklearn.preprocessing import OneHotEncoder
cat_onehot = categoricals[['State', 'Response', 'Employment Status', 'Gender', 'Location Code', 'Marital Status', 'Policy Type', 'Policy', 'Sales Channel', 'Vehicle Class']].copy()
encoder = OneHotEncoder().fit(cat_onehot)
cols = [colname for row in encoder.categories_ for colname in row]
encoded = encoder.transform(cat_onehot).toarray()
cat_onehot_encoded = pd.DataFrame(encoded,columns=cols)
cols_to_drop = [row[0] for row in encoder.categories_]
cat_onehot_encoded = cat_onehot_encoded.drop(cols_to_drop, axis=1)
    
cat_label = categoricals[['Coverage', 'Renew Offer Type', 'Education', 'Vehicle Size']].copy()
coverage_mapper = { 'Basic':1, 'Extended':2, 'Premium':3 }
education_mapper = { 'High School or Below':1, 'College':2, 'Bachelor':3, 'Master':4, 'Doctor':5 }
veh_size_mapper = { 'Small':1, 'Medsize':2, 'Large':3 }

cat_label['Coverage'] = cat_label['Coverage'].replace(coverage_mapper)
cat_label['Education'] = cat_label['Education'].replace(education_mapper)
cat_label['Vehicle Size'] = cat_label['Vehicle Size'].replace(veh_size_mapper)
cat_label['Renew Offer Type'] = cat_label['Renew Offer Type'].str[-1:]
cat_label.reset_index(drop=True, inplace=True)    # used to reset the index, as the concatenate below was generating more rows than the input df had
cat_onehot_encoded.reset_index(drop=True, inplace=True)    # used to reset the index, as the concatenate below was generating more rows than the input df had
categoricals_encoded = pd.concat([cat_onehot_encoded, cat_label], axis=1)
X_complete = pd.concat([X_standardized, categoricals_encoded], axis=1)

    # Linear regression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_complete, y, test_size=0.2, random_state=42)
    
from sklearn import linear_model
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)
    
    
from sklearn.metrics import r2_score
predictions_train = lm.predict(X_train)
R2_train = r2_score(y_train, predictions_train)
predictions_test = lm.predict(X_test)
R2_test = r2_score(y_test, predictions_test)
from sklearn.metrics import mean_squared_error
mse_train = np.sqrt(mean_squared_error(y_train,predictions_train))
mse_test = np.sqrt(mean_squared_error(y_test,predictions_test))
    
import math
rmse_train = math.sqrt(mse_train)
rmse_test = math.sqrt(mse_test)
    
from sklearn.metrics import mean_absolute_error
mae_train = mean_absolute_error(y_train, predictions_train)
mae_test = mean_absolute_error(y_test, predictions_test)
    
# Creation of a dataframe as ouput (based on example found there : https://www.geeksforgeeks.org/different-ways-to-create-pandas-dataframe/)
    
validation_matrix = { 'Train Set': [R2_train, mse_train, rmse_train, mae_train],
                 'Test Set': [R2_test, mse_test, rmse_test, mae_test]
                } 
validation_matrix = pd.DataFrame(validation_matrix, index=['R2','mse', 'rmse', 'mae'])
    
# return validation_matrix

Index(['Customer Lifetime Value', 'Income', 'Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Number of Policies', 'Total Claim Amount',
       'Effective To Month'],
      dtype='object')
0     4809.216960
1     2228.525238
2    14947.917300
3    22332.439460
6     5035.035257
Name: Customer Lifetime Value, dtype: float64
73117126.298532
0    48029
1        0
2    22139
6    37405
7    87197
Name: Income, dtype: int64
313039212
0     61
1     64
2    100
6     63
7     63
Name: Monthly Premium Auto, dtype: int64
756428
0     7.0
1     3.0
2    34.0
6     8.0
7    35.0
Name: Months Since Last Claim, dtype: float64
120424.0
0    52
1    26
2    31
6    99
7    45
Name: Months Since Policy Inception, dtype: int64
384573
0    0.0
1    0.0
2    0.0
6    3.0
7    0.0
Name: Number of Open Complaints, dtype: float64
3134.0
Series([], Name: Number of Policies, dtype: int64)
0


IndexError: cannot do a non-empty take from an empty axes.

In [34]:
input_data['Number of Open Complaints'].sum()

3949.0

In [35]:
np.percentile(input_data['Number of Open Complaints'], 75)

nan

In [9]:
validation_matrix

Unnamed: 0,Train Set,Test Set
R2,0.742529,0.758946
mse,117.29285,115.429813
rmse,10.830182,10.743827
mae,84.33913,84.017166
