# Install and Set Up Kaggle and API Key

Follow these steps to install and configure the Kaggle API on your system:

1. **Create a Kaggle Account**
   - Visit [Kaggle](https://www.kaggle.com) and sign up for an account.

2. **Obtain Kaggle API Key**
   - Go to your Kaggle account settings.
   - Find the "API" section and click on "Create New API Token".
   - This will download a `kaggle.json` file containing your API key.

3. **Install Kaggle Package**
   - Use Conda to install the Kaggle package by running:
     ```bash
     conda install kaggle
     ```

4. **Configure API Key**
   - Copy the `kaggle.json` file to your user directory under the `.kaggle` folder. On most systems, you can use the following command:
     ```bash
     mkdir -p ~/.kaggle
     cp path_to_downloaded_kaggle.json ~/.kaggle/kaggle.json
     chmod 600 ~/.kaggle/kaggle.json
     ```
   - Ensure the `.kaggle` directory and the `kaggle.json` file have the proper permissions by setting:
     ```bash
     chmod 600 ~/.kaggle/kaggle.json
     ```


In [1]:
import pandas as pd
import kaggle
import Marketing_Campaign as mc
# Pre processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
import numpy as np


# Scoring 
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
# models 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
#potting
import matplotlib.pyplot as plt



In [2]:
# Get the data using an API call
kaggle.api.dataset_download_files('rodsaldanha/arketing-campaign', path='resources', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/rodsaldanha/arketing-campaign


In [3]:
# Import the data
data = pd.read_csv("./resources/marketing_campaign.csv",delimiter=';')


# EDA (Exploratory Data Analysis)
We will revisit this. For now We want the rough draft of the model
#
During EDA

Visualize the data using plots and graphs to understand distributions and relationships between variables.
Calculate summary statistics to get a sense of the central tendencies and variability.
Identify any correlations between variables that might influence model choices.
Detect and treat missing values or outliers that could skew the results of your analysis.
Explore the data's structure to inform feature selection and engineering, which are key to building effective machine learning models.

# read any and all documentation you can find on your dataset to understand it better


In [4]:
display (data.head())
# what does our data look like? At this point also use any documentation on the data set to find out what each value means and how it might be used is solving the business problem
print (f'{data.shape}\n')


Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,5,0,0,0,0,0,0,3,11,0


(2240, 29)



In [5]:
# Preprocess = Clean up NA if number of NA in column is less that a percentage of rows
# this automatically cleans up rows below a threshold and list columns when NA rows exceed the threshold
data1=mc.auto_drop_na(data,2)

Drop Percent of the rows is %2
If the number of NA values in a column is less than the calculated threshold, automatically drop the NA rows.
{'Income': 24}
Automatically dropping rows in Income where NA values are present.


In [6]:
# Identify non numeric columns we will need to deal with
non_numeric= (data1.dtypes[(data1.dtypes != 'int64') & (data1.dtypes != 'float64')]).index.tolist()
# display (data.dtypes)
print (f'Columns that are not numeric :\n {non_numeric}\n')

for column in non_numeric:
    print (data1[column].value_counts())
    print ('\n')

Columns that are not numeric :
 ['Education', 'Marital_Status', 'Dt_Customer']

Education
Graduation    1116
PhD            481
Master         365
2n Cycle       200
Basic           54
Name: count, dtype: int64


Marital_Status
Married     857
Together    573
Single      471
Divorced    232
Widow        76
Alone         3
Absurd        2
YOLO          2
Name: count, dtype: int64


Dt_Customer
2012-08-31    12
2012-09-12    11
2013-02-14    11
2014-05-12    11
2013-08-20    10
              ..
2012-08-05     1
2012-11-18     1
2014-05-09     1
2013-06-26     1
2014-01-09     1
Name: count, Length: 662, dtype: int64




# Numeric data to group
The year of birth data creates too many unique data to be useful. We have chosen to convert the Year Birth data to a 6 generation values. After one hot encode for generation we drop the year birth column. 

# non numeric column 
- Education OrdinalEncoder because education counts
     - 0 - **Basic** This generally refers to elementary or primary education.
     - 1 - **2n Cycle** This is not a commonly used term globally but might refer to secondary education or an intermediary level in some education systems.
     - 2 - **Graduation** Typically refers to the completion of a bachelor's or undergraduate degree.
     - 3 - **Master** A postgraduate degree that follows the completion of a bachelor's degree.
     - 4 - **PhD** The highest university degree, typically following a master's degree.

- Marital_Status - The status has no weighted values. change none standard answers to single and then one hot encode three remaining values
     - Alone     Single
     - Absurd    Single
     - YOLO      Single

- Year_Birth - Convert to 6 generations and then One Hot Encode

- Dt_Customer - We will convert to data time and represent this in number of months the cusomter has been with us

# ID column
The id column is etiher a uniwue customer id or an index falue that became a column in the past

In [7]:
# Education encode
categories = ['Basic','2n Cycle','Graduation','Master','PhD']
column = 'Education'
data2 = mc.preprocess_ord(data1,column,categories)

In [8]:
# Marital Status Encode
data2['Marital_Status'].replace(['Alone', 'Absurd', 'YOLO'], 'single',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data2['Marital_Status'].replace(['Alone', 'Absurd', 'YOLO'], 'single',inplace=True)


In [9]:
# transform Year_Birth to Generations
data3 = mc.set_gen(data2,'Year_Birth')
print (data3.columns)

Index(['ID', 'Year_Birth', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome',
       'Dt_Customer', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts',
       'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
       'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
       'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp3',
       'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2',
       'Complain', 'Z_CostContact', 'Z_Revenue', 'Response', 'Education',
       'Generations'],
      dtype='object')


In [15]:
data4 = mc.date_to_months(data3,'Dt_Customer',2024)

print (data4['Dt_Customer'].value_counts())

Dt_Customer
137    117
123    115
116    111
118    108
132    107
125    102
128    102
130    101
131     99
126     98
136     97
134     96
135     95
124     94
117     94
121     92
127     89
120     88
122     87
129     87
133     83
119     78
115     74
138      2
Name: count, dtype: int64


In [11]:
ohe_column_list = ['Marital_Status', 'Generations']

In [12]:
data5 = mc.preprocess_ohe(data4,ohe_column_list)

In [13]:
data5.head()

Unnamed: 0,ID,Year_Birth,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,...,Marital_Status_Single,Marital_Status_Together,Marital_Status_Widow,Marital_Status_single,Generations_Baby Boomers,Generations_Generation X,Generations_Lost Generation,Generations_Millennials,Generations_Silent Generation,Generations_Unknown Generation
0,5524,1957,58138.0,0,0,136,58,635,88,546,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2174,1954,46344.0,1,1,118,38,11,1,6,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,4141,1965,71613.0,0,0,125,26,426,49,127,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,6182,1984,26646.0,1,0,119,26,11,4,20,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,5324,1981,58293.0,1,0,120,94,173,43,118,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [20]:
# Identify columns with only one unique value
single_value_columns = data5.nunique() == 1

# Drop these columns from the DataFrame
data5 = data5.loc[:, ~single_value_columns]

data6 = data5.drop(['ID','Year_Birth'],axis=1)

print (data6.head())

Index(['ID', 'Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Dt_Customer',
       'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts',
       'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
       'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
       'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp3',
       'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2',
       'Complain', 'Response', 'Education', 'Marital_Status_Divorced',
       'Marital_Status_Married', 'Marital_Status_Single',
       'Marital_Status_Together', 'Marital_Status_Widow',
       'Marital_Status_single', 'Generations_Baby Boomers',
       'Generations_Generation X', 'Generations_Lost Generation',
       'Generations_Millennials', 'Generations_Silent Generation',
       'Generations_Unknown Generation'],
      dtype='object')
Index(['Income', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency', 'MntWines',
       'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProduc

# Check for imbalanced Y value

In [None]:
# look for balance of our data
df["Response"].value_counts()

In [None]:
df.describe()

In [None]:
df["Education"].value_counts()

In [None]:
pd.set_option('display.max_rows', 500)


In [None]:
# Preprocess the training data
df_proc=X_preprocess(df)
display (df_proc)
# df
display (df_proc['Education'].value_counts())

# this section will address the data imbalance we see in our y value. 


In [None]:
from imblearn.over_sampling import RandomOverSampler

# Applying RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Check the new class distribution
print(y_resampled.value_counts())

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Applying RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Check the new class distribution
print(y_resampled.value_counts())

In [None]:
from imblearn.over_sampling import SMOTE

# Applying SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check the new class distribution
print(y_resampled.value_counts())
# Split the resampled data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.20, random_state=42)

# Train a RandomForestClassifier with class weights
rfc = RandomForestClassifier(class_weight='balanced', random_state=42)
rfc.fit(X_train, y_train)

# Predict and evaluate
y_pred = rfc.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from imblearn.combine import SMOTEENN

# Applying SMOTEENN (combination of SMOTE and Edited Nearest Neighbors)
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# Split the resampled data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.20, random_state=42)

# Train and evaluate the model
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Scale the X data by using StandardScaler()
scaler_ss = StandardScaler().fit(X_train)
X_train_ss_scaled = scaler_ss.transform(X_train)
display (X_train_ss_scaled)

# Transform the test dataset based on the fit from the training dataset
X_test_ss_scaled = scaler_ss.transform(X_test)
display (X_test_ss_scaled)

In [None]:
# now lets look at min max scaler
scaler_mm = MinMaxScaler().fit(X_train)
X_train_mm_scaled = scaler_mm.transform(X_train)
display (X_train_mm_scaled)
#
X_test_mm_scaled = scaler_mm.transform(X_test)
display (X_test_mm_scaled)

X_test_mm_scaled = scaler_mm.transform(X_test)
display (X_test_mm_scaled)

In [None]:
# Use Logistic model to find out what scaler works best

# Create a `LogisticRegression` function and assign it 
# to a variable named `logistic_regression_model`.
logistic_regression_model_ss = LogisticRegression()
logistic_regression_model_ss.fit(X_train_ss_scaled, y_train)
#
logistic_regression_model_mm = LogisticRegression()
logistic_regression_model_mm.fit(X_train_mm_scaled, y_train)
# Score the Logistic model

print(f"Standard Scaler\nTraining Data Score: {logistic_regression_model_ss.score(X_train_ss_scaled, y_train)}")
print(f"Testing Data Score: {logistic_regression_model_ss.score(X_test_ss_scaled, y_test)}")
print(f"Min Max Scaler\nTraining Data Score: {logistic_regression_model_mm.score(X_train_mm_scaled, y_train)}")
print(f"Testing Data Score: {logistic_regression_model_mm.score(X_test_mm_scaled, y_test)}")

# Test models
    -RANDOM FOREST MODEL
    -Decision Tree Model

# **RANDOM FOREST MODEL


In [None]:
# Create and train the model
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train_ss_scaled, y_train)
# Predict on test set
y_pred = random_forest_model.predict(X_test_ss_scaled)
# Calculate precision, recall, F1 score
# Cross-validation scores
cv_scores = cross_val_score(random_forest_model, X_train_ss_scaled, y_train, cv=5, scoring='accuracy')
display (random_forest_model)


# Decision Tree Model


In [None]:
# Create and train the model
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train_ss_scaled, y_train)
# Predict on test set
y_pred = decision_tree_model.predict(X_test_ss_scaled)
# Cross-validation scores
cv_scores = cross_val_score(decision_tree_model, X_train_ss_scaled, y_train, cv=5, scoring='accuracy')


In [None]:
# Test models
# -RANDOM FOREST MODEL
# Score the model
print(f"Random Forest - Training Data Score: {random_forest_model.score(X_train_ss_scaled, y_train)}")
print(f"Random Forest - Testing Data Score: {random_forest_model.score(X_test_ss_scaled, y_test)}")
print(f"Random Forest - Precision: {precision_score(y_test, y_pred)}")
print(f"Random Forest - Recall: {recall_score(y_test, y_pred)}")
print(f"Random Forest - F1 Score: {f1_score(y_test, y_pred)}")
print(f"Random Forest - Cross-Validation Accuracy: {cv_scores.mean()}")


In [None]:
y_test = np.squeeze(y_test)
y_pred = np.squeeze(y_pred)

# Compare actual and predicted responses
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Sample some data to plot
sampled_data = comparison_df.sample(50, random_state=42)
sampled_data.plot(kind='bar', figsize=(14, 8))
plt.title('Comparison of Actual and Predicted Responses')
plt.show()

In [None]:
# Scatter plot for random forest
# 

import matplotlib.pyplot as plt
import numpy as np

features = X_train.columns
importances = model.feature_importances_
indices = np.argsort(importances)

plt.figure(figsize=(10, 6))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
