# Install and Set Up Kaggle and API Key

Follow these steps to install and configure the Kaggle API on your system:

1. **Create a Kaggle Account**
   - Visit [Kaggle](https://www.kaggle.com) and sign up for an account.

2. **Obtain Kaggle API Key**
   - Go to your Kaggle account settings.
   - Find the "API" section and click on "Create New API Token".
   - This will download a `kaggle.json` file containing your API key.

3. **Install Kaggle Package**
   - Use Conda to install the Kaggle package by running:
     ```bash
     conda install kaggle
     ```

4. **Configure API Key**
   - Copy the `kaggle.json` file to the `.kaggle` folder in your user (home) directory. On most systems, you can use the following commands:
     ```bash
     mkdir -p ~/.kaggle
     cp path_to_downloaded_kaggle.json ~/.kaggle/kaggle.json
     chmod 600 ~/.kaggle/kaggle.json
     ```
   - If you skipped the last command above, make sure the `kaggle.json` file is readable and writable only by your user by running:
     ```bash
     chmod 600 ~/.kaggle/kaggle.json
     ```


In [None]:
import pandas as pd
import kaggle
# Pre processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Scoring 
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
# models 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier



In [None]:
# Download the marketing-campaign dataset through the Kaggle API into the
# local 'resources' folder and unzip it there. This needs the API token set up
# as described at the top of this notebook.
# NOTE(review): the slug reads 'arketing-campaign' (no leading "m"); this
# appears to be the dataset's published Kaggle slug -- confirm before
# "correcting" it.
kaggle.api.dataset_download_files(
    'rodsaldanha/arketing-campaign',
    path='resources',
    unzip=True,
)

In [None]:
# Load the downloaded CSV into a DataFrame; fields in this file are separated by ';'.
data = pd.read_csv("./resources/marketing_campaign.csv", sep=';')


# EDA (Exploratory Data Analysis)
We will revisit this. For now, we want a rough draft of the model.

## During EDA

Visualize the data using plots and graphs to understand distributions and relationships between variables.
Calculate summary statistics to get a sense of the central tendencies and variability.
Identify any correlations between variables that might influence model choices.
Detect and treat missing values or outliers that could skew the results of your analysis.
Explore the data's structure to inform feature selection and engineering, which are key to building effective machine learning models.

# Read any and all documentation you can find on your dataset to understand it better


In [4]:
# First look at the raw table: a sample of rows, then the (rows, columns) shape.
# At this point, also consult any documentation for the dataset to learn what
# each value means and how it could help solve the business problem.
display(data.head())
display(data.shape)

# Count missing values per column and report only the columns that have any.
# Decision to make: can the gaps be filled, or should those rows be dropped?
na_counts = data.isna().sum()
print(f'Columns with NA valuses \n {na_counts[na_counts > 0]}')

# Collect the names of every column whose dtype is neither int64 nor float64.
# These need a closer look to decide how (or whether) the model can use them.
column_dtypes = data.dtypes
is_int_or_float = (column_dtypes == 'int64') | (column_dtypes == 'float64')
non_numeric = column_dtypes[~is_int_or_float].index.tolist()
print(f'Columns that are not numeric :\n {non_numeric}')

In [None]:
# Rough-draft preprocessing: keep only the int64/float64 columns and discard
# every row that still contains a missing value. This is deliberately crude so
# a first model can be trained; revisit it once the draft results are in.

# 1) Remove the columns listed in `non_numeric`.
data_drop_columns = data.drop(non_numeric, axis="columns")

# 2) Remove the rows that have at least one NA.
data_drop_na = data_drop_columns.dropna(axis="index")

# 3) Work on an independent copy from here on.
df = data_drop_na.copy()

# Last expression of the cell: preview the cleaned frame.
df.head()



In [None]:
# Separate the target ('Response') from the features, then hold out 20% of the
# rows for testing (80/20 split). The fixed random_state keeps the split
# reproducible between runs.
# TODO: add wording explaining why 'Response' was chosen as the target.
y = df["Response"]
X = df.drop(columns='Response')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Scaling the data 
We will compare the scores obtained with the standard scaler and the min-max scaler to pick the best scaling method.

In [7]:
# Fit a StandardScaler on the training split only; the test split is scaled
# with the parameters learned from the training data.
scaler_ss = StandardScaler()
scaler_ss.fit(X_train)

# Scale both splits with the fitted scaler.
X_train_ss_scaled = scaler_ss.transform(X_train)
X_test_ss_scaled = scaler_ss.transform(X_test)

# Last expression of the cell: the notebook echoes the scaled test matrix.
X_test_ss_scaled

array([[ 0.28465542,  1.67814135, -1.34041544, ..., -0.0923974 ,
         0.        ,  0.        ],
       [-0.33894248, -1.62391459,  0.41722354, ..., -0.0923974 ,
         0.        ,  0.        ],
       [-1.54518097, -0.38564361, -0.37026134, ..., -0.0923974 ,
         0.        ,  0.        ],
       ...,
       [-0.19420618,  0.43987037, -1.19440407, ..., -0.0923974 ,
         0.        ,  0.        ],
       [ 1.40128059,  0.60497317,  0.55792611, ..., -0.0923974 ,
         0.        ,  0.        ],
       [ 1.63378252,  0.43987037,  0.59547522, ..., -0.0923974 ,
         0.        ,  0.        ]])

In [8]:
# Now look at the min-max scaler. It is fitted on the training split only; the
# test split is scaled with the parameters learned from the training data.
scaler_mm = MinMaxScaler().fit(X_train)

# Scale both splits with the fitted scaler.
X_train_mm_scaled = scaler_mm.transform(X_train)
X_test_mm_scaled = scaler_mm.transform(X_test)

# Show each scaled matrix exactly once: training split first, then test split.
display(X_train_mm_scaled)
display(X_test_mm_scaled)

array([[0.46054866, 0.76699029, 0.05418567, ..., 0.        , 0.        ,
        0.        ],
       [0.40988294, 0.49514563, 0.06508296, ..., 0.        , 0.        ,
        0.        ],
       [0.17219194, 0.77669903, 0.1017707 , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.02198195, 0.69902913, 0.09737779, ..., 0.        , 0.        ,
        0.        ],
       [0.19444196, 0.76699029, 0.1179121 , ..., 0.        , 0.        ,
        0.        ],
       [0.96792065, 0.63106796, 0.06749522, ..., 0.        , 0.        ,
        0.        ]])

array([[0.5822536 , 0.93203883, 0.02369702, ..., 0.        , 0.        ,
        0.        ],
       [0.40130462, 0.54368932, 0.0919111 , ..., 0.        , 0.        ,
        0.        ],
       [0.05129122, 0.68932039, 0.06134876, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.44330265, 0.78640777, 0.02936373, ..., 0.        , 0.        ,
        0.        ],
       [0.90626396, 0.80582524, 0.09737178, ..., 0.        , 0.        ,
        0.        ],
       [0.97372889, 0.78640777, 0.09882906, ..., 0.        , 0.        ,
        0.        ]])

In [9]:
# Use a logistic regression model to find out which scaler works best

In [10]:
# Train one default LogisticRegression per scaling scheme so that the two
# scalers can be compared on equal footing.
logistic_regression_model_ss = LogisticRegression().fit(X_train_ss_scaled, y_train)
logistic_regression_model_mm = LogisticRegression().fit(X_train_mm_scaled, y_train)

# Score each model on its own scaled training and test matrices.
ss_train_score = logistic_regression_model_ss.score(X_train_ss_scaled, y_train)
ss_test_score = logistic_regression_model_ss.score(X_test_ss_scaled, y_test)
mm_train_score = logistic_regression_model_mm.score(X_train_mm_scaled, y_train)
mm_test_score = logistic_regression_model_mm.score(X_test_mm_scaled, y_test)

# Report the results: standard scaler first, then min-max scaler.
print(f"Standard Scaler\nTraining Data Score: {ss_train_score}")
print(f"Testing Data Score: {ss_test_score}")
print(f"Min Max Scaler\nTraining Data Score: {mm_train_score}")
print(f"Testing Data Score: {mm_test_score}")

Standard Scaler
Training Data Score: 0.8871331828442438
Testing Data Score: 0.8738738738738738
Min Max Scaler
Training Data Score: 0.8820541760722348
Testing Data Score: 0.8986486486486487


# Test models
- RANDOM FOREST MODEL
- GradientBoostingClassifier
- KNeighborsClassifier
- SVC (Support Vector Machine)
- LogisticRegression
- Decision Tree Model

# RANDOM FOREST MODEL


In [11]:
# Random forest baseline, trained on the standard-scaled training split.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train_ss_scaled, y_train)

# Predictions on the held-out test split feed precision / recall / F1 below.
y_pred = random_forest_model.predict(X_test_ss_scaled)

# 5-fold cross-validated accuracy, computed on the training split.
cv_scores = cross_val_score(
    random_forest_model,
    X_train_ss_scaled,
    y_train,
    cv=5,
    scoring='accuracy',
)

# Collect every metric first, then report them together.
train_score = random_forest_model.score(X_train_ss_scaled, y_train)
test_score = random_forest_model.score(X_test_ss_scaled, y_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cv_accuracy = cv_scores.mean()

print(f"Random Forest - Training Data Score: {train_score}")
print(f"Random Forest - Testing Data Score: {test_score}")
print(f"Random Forest - Precision: {precision}")
print(f"Random Forest - Recall: {recall}")
print(f"Random Forest - F1 Score: {f1}")
print(f"Random Forest - Cross-Validation Accuracy: {cv_accuracy}")

NameError: name 'cross_val_score' is not defined

# GradientBoostingClassifier MODELING


In [None]:
# Gradient boosting baseline, trained on the standard-scaled training split.
gbm_model = GradientBoostingClassifier(random_state=42)
gbm_model.fit(X_train_ss_scaled, y_train)

# Predictions on the held-out test split feed precision / recall / F1 below.
y_pred = gbm_model.predict(X_test_ss_scaled)

# 5-fold cross-validated accuracy, computed on the training split.
cv_scores = cross_val_score(
    gbm_model,
    X_train_ss_scaled,
    y_train,
    cv=5,
    scoring='accuracy',
)

# Collect every metric first, then report them together.
train_score = gbm_model.score(X_train_ss_scaled, y_train)
test_score = gbm_model.score(X_test_ss_scaled, y_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cv_accuracy = cv_scores.mean()

print(f"Gradient Boosting Machine - Training Data Score: {train_score}")
print(f"Gradient Boosting Machine - Testing Data Score: {test_score}")
print(f"Gradient Boosting Machine - Precision: {precision}")
print(f"Gradient Boosting Machine - Recall: {recall}")
print(f"Gradient Boosting Machine - F1 Score: {f1}")
print(f"Gradient Boosting Machine - Cross-Validation Accuracy: {cv_accuracy}")


# KNeighborsClassifier

In [None]:


# K-nearest-neighbours baseline (default settings), trained on the
# standard-scaled training split.
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_ss_scaled, y_train)

# Predictions on the held-out test split feed precision / recall / F1 below.
y_pred = knn_model.predict(X_test_ss_scaled)

# 5-fold cross-validated accuracy, computed on the training split.
cv_scores = cross_val_score(
    knn_model,
    X_train_ss_scaled,
    y_train,
    cv=5,
    scoring='accuracy',
)

# Collect every metric first, then report them together.
train_score = knn_model.score(X_train_ss_scaled, y_train)
test_score = knn_model.score(X_test_ss_scaled, y_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cv_accuracy = cv_scores.mean()

print(f"K-Nearest Neighbors - Training Data Score: {train_score}")
print(f"K-Nearest Neighbors - Testing Data Score: {test_score}")
print(f"K-Nearest Neighbors - Precision: {precision}")
print(f"K-Nearest Neighbors - Recall: {recall}")
print(f"K-Nearest Neighbors - F1 Score: {f1}")
print(f"K-Nearest Neighbors - Cross-Validation Accuracy: {cv_accuracy}")

# SVC (Support Vector Machine) Model

In [None]:

# Support vector classifier baseline (default settings), trained on the
# standard-scaled training split.
svm_model = SVC()
svm_model.fit(X_train_ss_scaled, y_train)

# Predictions on the held-out test split feed precision / recall / F1 below.
y_pred = svm_model.predict(X_test_ss_scaled)

# 5-fold cross-validated accuracy, computed on the training split.
cv_scores = cross_val_score(
    svm_model,
    X_train_ss_scaled,
    y_train,
    cv=5,
    scoring='accuracy',
)

# Collect every metric first, then report them together.
train_score = svm_model.score(X_train_ss_scaled, y_train)
test_score = svm_model.score(X_test_ss_scaled, y_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cv_accuracy = cv_scores.mean()

print(f"Support Vector Machine - Training Data Score: {train_score}")
print(f"Support Vector Machine - Testing Data Score: {test_score}")
print(f"Support Vector Machine - Precision: {precision}")
print(f"Support Vector Machine - Recall: {recall}")
print(f"Support Vector Machine - F1 Score: {f1}")
print(f"Support Vector Machine - Cross-Validation Accuracy: {cv_accuracy}")


# LogisticRegression Model

In [None]:
# Logistic regression baseline (default settings), trained on the
# standard-scaled training split.
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train_ss_scaled, y_train)

# Predictions on the held-out test split feed precision / recall / F1 below.
y_pred = logistic_regression_model.predict(X_test_ss_scaled)

# 5-fold cross-validated accuracy, computed on the training split.
cv_scores = cross_val_score(
    logistic_regression_model,
    X_train_ss_scaled,
    y_train,
    cv=5,
    scoring='accuracy',
)

# Collect every metric first, then report them together.
train_score = logistic_regression_model.score(X_train_ss_scaled, y_train)
test_score = logistic_regression_model.score(X_test_ss_scaled, y_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cv_accuracy = cv_scores.mean()

print(f"Logistic Regression - Training Data Score: {train_score}")
print(f"Logistic Regression - Testing Data Score: {test_score}")
print(f"Logistic Regression - Precision: {precision}")
print(f"Logistic Regression - Recall: {recall}")
print(f"Logistic Regression - F1 Score: {f1}")
print(f"Logistic Regression - Cross-Validation Accuracy: {cv_accuracy}")

# Decision Tree Model


In [None]:
# Decision tree baseline, trained on the standard-scaled training split.
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train_ss_scaled, y_train)

# Predictions on the held-out test split feed precision / recall / F1 below.
y_pred = decision_tree_model.predict(X_test_ss_scaled)

# 5-fold cross-validated accuracy, computed on the training split.
cv_scores = cross_val_score(
    decision_tree_model,
    X_train_ss_scaled,
    y_train,
    cv=5,
    scoring='accuracy',
)

# Collect every metric first, then report them together.
train_score = decision_tree_model.score(X_train_ss_scaled, y_train)
test_score = decision_tree_model.score(X_test_ss_scaled, y_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cv_accuracy = cv_scores.mean()

print(f"Decision Tree - Training Data Score: {train_score}")
print(f"Decision Tree - Testing Data Score: {test_score}")
print(f"Decision Tree - Precision: {precision}")
print(f"Decision Tree - Recall: {recall}")
print(f"Decision Tree - F1 Score: {f1}")
print(f"Decision Tree - Cross-Validation Accuracy: {cv_accuracy}")