# Install and Set Up Kaggle and API Key

Follow these steps to install and configure the Kaggle API on your system:

1. **Create a Kaggle Account**
   - Visit [Kaggle](https://www.kaggle.com) and sign up for an account.

2. **Obtain Kaggle API Key**
   - Go to your Kaggle account settings.
   - Find the "API" section and click on "Create New API Token".
   - This will download a `kaggle.json` file containing your API key.

3. **Install Kaggle Package**
   - Use Conda to install the Kaggle package by running:
     ```bash
     conda install kaggle
     ```

4. **Configure API Key**
   - Copy the `kaggle.json` file to your user directory under the `.kaggle` folder. On most systems, you can use the following command:
     ```bash
     mkdir -p ~/.kaggle
     cp path_to_downloaded_kaggle.json ~/.kaggle/kaggle.json
     chmod 600 ~/.kaggle/kaggle.json
     ```
   - Ensure the `.kaggle` directory and the `kaggle.json` file have the proper permissions by setting:
     ```bash
     chmod 600 ~/.kaggle/kaggle.json
     ```


In [1]:
import pandas as pd
import kaggle
# Pre processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
import numpy as np


# Scoring 
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
# models 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
#potting
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.decomposition import PCA




In [2]:
# Get the data using an API call
kaggle.api.dataset_download_files('rodsaldanha/arketing-campaign', path='resources', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/rodsaldanha/arketing-campaign


In [3]:
# Import the data
data = pd.read_csv("./resources/marketing_campaign.csv",delimiter=';')


# EDA (Exploratory Data Analysis)
We will revisit this. For now We want the rough draft of the model
#
During EDA

Visualize the data using plots and graphs to understand distributions and relationships between variables.
Calculate summary statistics to get a sense of the central tendencies and variability.
Identify any correlations between variables that might influence model choices.
Detect and treat missing values or outliers that could skew the results of your analysis.
Explore the data's structure to inform feature selection and engineering, which are key to building effective machine learning models.

# read any and all documentation you can find on your dataset to understand it better


In [4]:
display (data.head())
# what does our data look like? At this point also use any documentation on the data set to find out what each value means and how it might be used is solving the business problem
display (data.shape)
print (f'Columns with NA valuses \n {data.isna().sum()[lambda x: x > 0]}')
# Make desision about null values. Can we fill them of should we drop rows with null values?
non_numeric= (data.dtypes[(data.dtypes != 'int64') & (data.dtypes != 'float64')]).index.tolist()
# display (data.dtypes)
print (f'Columns that are not numeric :\n {non_numeric}')
# Explore non numberic type to see how we can use them in the model

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,5,0,0,0,0,0,0,3,11,0


(2240, 29)

Columns with NA valuses 
 Income    24
dtype: int64
Columns that are not numeric :
 ['Education', 'Marital_Status', 'Dt_Customer']


In [5]:
# Split the data into training and test sets with an 80/20 split
X = data.drop('Response', axis=1)
y = data["Response"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Define the generational labels and ranges
def assign_generation(year):
    if year <= 1927:
        return 'Greatest Generation'
    elif 1928 <= year <= 1945:
        return 'Silent Generation'
    elif 1946 <= year <= 1964:
        return 'Baby Boomers'
    elif 1965 <= year <= 1980:
        return 'Generation X'
    elif 1981 <= year <= 1996:
        return 'Millennials (Gen Y)'
    elif 1997 <= year <= 2012:
        return 'Generation Z (Gen Z)'
    else:
        return 'Generation Alpha'

# Apply the function to create a new column 'Generation'
X_train['Generation'] = X_train['Year_Birth'].apply(assign_generation)
X_test['Generation'] = X_test['Year_Birth'].apply(assign_generation)

# Calculate Months as Customer
X_train['Dt_Customer'] = pd.to_datetime(X_train['Dt_Customer'])
X_test['Dt_Customer'] = pd.to_datetime(X_test['Dt_Customer'])
current_date = datetime(2014, 6, 29)
X_train['Months_Customer'] = ((current_date.year - X_train['Dt_Customer'].dt.year) * 12 +
                              (current_date.month - X_train['Dt_Customer'].dt.month))
X_test['Months_Customer'] = ((current_date.year - X_test['Dt_Customer'].dt.year) * 12 +
                             (current_date.month - X_test['Dt_Customer'].dt.month))
X_train = X_train.drop('Dt_Customer', axis=1)
X_test = X_test.drop('Dt_Customer', axis=1)

# Drop unnecessary columns and handle NA
X_train = X_train.drop(columns=["ID", "Year_Birth"]).dropna()
X_test = X_test.drop(columns=["ID", "Year_Birth"]).dropna()

# Create encoders for the categorical variables
Mar_ord_enc = OrdinalEncoder(categories=[['Married', 'Together', 'Single', 'Divorced', 'Widow', "YOLO", "Absurd", "Alone"]])
edu_ord_enc = OrdinalEncoder(categories=[['Basic', '2n Cycle', 'Graduation', 'Master', 'PhD']])
Age_ord_enc = OrdinalEncoder(categories=[['Greatest Generation', 'Silent Generation', 'Baby Boomers', 'Generation X', 'Millennials (Gen Y)', 'Generation Z (Gen Z)', 'Generation Alpha']])

# Train the encoders on the training data
Mar_ord_enc.fit(X_train['Marital_Status'].values.reshape(-1, 1))
edu_ord_enc.fit(X_train['Education'].values.reshape(-1, 1))
Age_ord_enc.fit(X_train['Generation'].values.reshape(-1, 1))

# Preprocessing function
def X_preprocess(X_data):
    # Ensure the data is in the correct format
    X_data = X_data.copy()
    
    # Transform each column into numpy arrays
    marital_status_encoded = Mar_ord_enc.transform(X_data['Marital_Status'].values.reshape(-1, 1))
    education_encoded = edu_ord_enc.transform(X_data['Education'].values.reshape(-1, 1))
    generation_encoded = Age_ord_enc.transform(X_data['Generation'].values.reshape(-1, 1))
    
    # Reorganize the numpy arrays into DataFrames
    marital_status_encoded_df = pd.DataFrame(marital_status_encoded, columns=Mar_ord_enc.get_feature_names_out(['Marital_Status']))
    education_encoded_df = pd.DataFrame(education_encoded, columns=edu_ord_enc.get_feature_names_out(['Education']))
    generation_encoded_df = pd.DataFrame(generation_encoded, columns=Age_ord_enc.get_feature_names_out(['Generation']))
    
    # Drop the original categorical columns from the original DataFrame
    X_data = X_data.drop(['Education', 'Marital_Status', 'Generation'], axis=1)
    
    # Combine the encoded columns with the remaining DataFrame
    out_df = pd.concat([X_data.reset_index(drop=True), marital_status_encoded_df.reset_index(drop=True), 
                        education_encoded_df.reset_index(drop=True), generation_encoded_df.reset_index(drop=True)], axis=1)
    
    # Return the DataFrame
    return out_df

# Preprocess the training and test data
X_train_proc = X_preprocess(X_train)
X_test_proc = X_preprocess(X_test)

# Check if preprocessing was successful
print("Processed Training Data:")
display(X_train_proc.head())
print("Processed Test Data:")
display(X_test_proc.head())



Processed Training Data:


Unnamed: 0,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,...,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Months_Customer,Marital_Status,Education,Generation
0,16813.0,0,0,49,4,8,11,12,2,13,...,0,0,0,0,3,11,11,2.0,2.0,2.0
1,64191.0,0,1,30,420,15,186,151,38,15,...,0,0,0,0,3,11,17,2.0,3.0,2.0
2,71969.0,0,1,59,1000,0,76,0,0,10,...,0,0,0,0,3,11,20,0.0,4.0,3.0
3,29187.0,1,0,43,26,0,6,0,0,2,...,0,0,0,0,3,11,13,0.0,4.0,3.0
4,4428.0,0,1,0,16,4,12,2,4,321,...,0,0,0,0,3,11,8,0.0,2.0,3.0


Processed Test Data:


Unnamed: 0,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,...,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Months_Customer,Marital_Status,Education,Generation
0,40464.0,0,1,78,424,17,118,7,23,41,...,0,0,0,0,3,11,17,2.0,2.0,2.0
1,47916.0,0,1,72,505,0,26,0,0,75,...,0,0,0,0,3,11,19,4.0,2.0,2.0
2,14188.0,0,0,40,2,7,11,16,12,27,...,0,0,0,0,3,11,16,0.0,0.0,3.0
3,76653.0,0,0,91,736,63,946,219,189,126,...,1,1,0,0,3,11,10,1.0,2.0,3.0
4,65196.0,0,2,34,743,19,181,12,0,200,...,0,0,0,0,3,11,11,1.0,2.0,2.0


In [6]:
pd.set_option('display.max_rows', 500)


In [7]:
# Initialize PCA with 10 components
pca_model = PCA(n_components=10)

# Fit PCA on the training data
pca_model.fit(X_train_proc)

# Transform the training and testing data
X_train_pca = pd.DataFrame(pca_model.transform(X_train_proc))
X_test_pca = pd.DataFrame(pca_model.transform(X_test_proc))

# Display the first few rows of the transformed data
print("PCA Transformed Training Data:")
display(X_train_pca.head())
print("PCA Transformed Test Data:")
display(X_test_pca.head())

PCA Transformed Training Data:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-35152.186917,31.597319,32.848204,1.789405,-7.501221,-2.829838,0.427663,-4.07257,-0.160827,0.990543
1,12227.965632,-21.84995,-43.830221,-41.415119,-77.324019,-71.377419,6.765761,20.530422,6.455263,-3.845308
2,20009.083875,-443.535123,-395.02611,80.324073,-14.25515,-5.520105,-10.089668,-6.703858,7.934384,1.842306
3,-22778.558987,120.312846,-8.173473,23.816314,-7.760422,-2.036845,5.244189,0.392524,2.62192,1.779452
4,-47536.427489,-101.570106,62.688477,-176.968679,243.027108,-1.224416,39.789909,34.59134,-8.969569,-8.244355


PCA Transformed Test Data:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-11498.543209,-197.129972,-72.099263,18.211767,8.242871,16.639759,-27.646249,-3.707407,5.346095,-5.128432
1,-4046.721436,-175.29766,-204.750293,7.911844,43.281052,-1.192585,-24.563606,-2.758675,6.544814,-1.571801
2,-37777.080684,9.742819,40.474342,-13.064699,-1.202207,-0.1392,7.514445,6.226917,4.299542,1.161635
3,24695.770357,-500.535235,533.81106,-84.078729,-40.540799,-3.833161,-59.921675,79.003943,-4.684441,1.088351
4,13235.179062,-312.482269,-181.23887,-41.665355,139.592048,-7.725662,12.404966,1.328137,-3.806295,-1.398428


In [8]:
from imblearn.over_sampling import RandomOverSampler

# Applying RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Check the new class distribution
print(y_resampled.value_counts())



Response
1    1906
0    1906
Name: count, dtype: int64


In [9]:
from imblearn.under_sampling import RandomUnderSampler

# Applying RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Check the new class distribution
print(y_resampled.value_counts())

Response
0    334
1    334
Name: count, dtype: int64


In [10]:
from imblearn.over_sampling import SMOTE

# Applying SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check the new class distribution
print(y_resampled.value_counts())

ValueError: could not convert string to float: 'Graduation'

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split the original data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Train a RandomForestClassifier with class weights
clf = RandomForestClassifier(class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from imblearn.combine import SMOTEENN

# Applying SMOTEENN (combination of SMOTE and Edited Nearest Neighbors)
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# Split the resampled data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.20, random_state=42)

# Train and evaluate the model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Scale the X data by using StandardScaler()
scaler_ss = StandardScaler().fit(X_train)
X_train_ss_scaled = scaler_ss.transform(X_train)
display (X_train_ss_scaled)

# Transform the test dataset based on the fit from the training dataset
X_test_ss_scaled = scaler_ss.transform(X_test)
display (X_test_ss_scaled)

In [None]:
# now lets look at min max scaler
scaler_mm = MinMaxScaler().fit(X_train)
X_train_mm_scaled = scaler_mm.transform(X_train)
display (X_train_mm_scaled)
#
X_test_mm_scaled = scaler_mm.transform(X_test)
display (X_test_mm_scaled)

X_test_mm_scaled = scaler_mm.transform(X_test)
display (X_test_mm_scaled)

In [None]:
# Use Logistic model to find out what scaler works best

# Create a `LogisticRegression` function and assign it 
# to a variable named `logistic_regression_model`.
logistic_regression_model_ss = LogisticRegression()
logistic_regression_model_ss.fit(X_train_ss_scaled, y_train)
#
logistic_regression_model_mm = LogisticRegression()
logistic_regression_model_mm.fit(X_train_mm_scaled, y_train)
# Score the Logistic model

print(f"Standard Scaler\nTraining Data Score: {logistic_regression_model_ss.score(X_train_ss_scaled, y_train)}")
print(f"Testing Data Score: {logistic_regression_model_ss.score(X_test_ss_scaled, y_test)}")
print(f"Min Max Scaler\nTraining Data Score: {logistic_regression_model_mm.score(X_train_mm_scaled, y_train)}")
print(f"Testing Data Score: {logistic_regression_model_mm.score(X_test_mm_scaled, y_test)}")

# Test models
    -RANDOM FOREST MODEL
    -GradientBoostingClassifier
    -KNeighborsClassifier
    -SVM (Support Vector Machine)
    -LogisticRegression
    -Decision Tree Model

# **RANDOM FOREST MODEL


In [None]:
# Create and train the model
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)
# Predict on test set
y_pred = random_forest_model.predict(X_test)
# Calculate precision, recall, F1 score
# Cross-validation scores
cv_scores = cross_val_score(random_forest_model, X_train, y_train, cv=5, scoring='accuracy')
display (random_forest_model)


# Create and train the model
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train_ss_scaled, y_train)
# Predict on test set
y_pred = random_forest_model.predict(X_test_ss_scaled)
# Calculate precision, recall, F1 score
# Cross-validation scores
cv_scores = cross_val_score(random_forest_model, X_train_ss_scaled, y_train, cv=5, scoring='accuracy')
display (random_forest_model)


In [None]:
# Create and train the model
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train_ss_scaled, y_train)
# Predict on test set
y_pred = random_forest_model.predict(X_test_ss_scaled)
# Calculate precision, recall, F1 score
# Cross-validation scores
cv_scores = cross_val_score(random_forest_model, X_train_ss_scaled, y_train, cv=5, scoring='accuracy')
display (random_forest_model)


# GradientBoostingClassifier MODELING


# Create and train the model
gbm_model = GradientBoostingClassifier(random_state=42)
gbm_model.fit(X_train_ss_scaled, y_train)
# Predict on test set
y_pred = gbm_model.predict(X_test_ss_scaled)
cv_scores = cross_val_score(gbm_model, X_train_ss_scaled, y_train, cv=5, scoring='accuracy')


# KNeighborsClassifier



# Create and train the model
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_ss_scaled, y_train)
# Predict on test set
y_pred = knn_model.predict(X_test_ss_scaled)
cv_scores = cross_val_score(knn_model, X_train_ss_scaled, y_train, cv=5, scoring='accuracy')


# SVC (Support Vector Machine) Model


# Create and train the model
svm_model = SVC()
svm_model.fit(X_train_ss_scaled, y_train)
# Predict on test set
y_pred = svm_model.predict(X_test_ss_scaled)
# Cross-validation scores
cv_scores = cross_val_score(svm_model, X_train_ss_scaled, y_train, cv=5, scoring='accuracy')



# LogisticRegression Model

# Create and train the model
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train_ss_scaled, y_train)
# Predict on test set
y_pred = logistic_regression_model.predict(X_test_ss_scaled)
# Cross-validation scores
cv_scores = cross_val_score(logistic_regression_model, X_train_ss_scaled, y_train, cv=5, scoring='accuracy')



# Decision Tree Model


In [None]:
# Create and train the model
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train_ss_scaled, y_train)
# Predict on test set
y_pred = decision_tree_model.predict(X_test_ss_scaled)
# Cross-validation scores
cv_scores = cross_val_score(decision_tree_model, X_train_ss_scaled, y_train, cv=5, scoring='accuracy')


In [None]:
# Test models
# -RANDOM FOREST MODEL
# Score the model
print(f"Random Forest - Training Data Score: {random_forest_model.score(X_train_ss_scaled, y_train)}")
print(f"Random Forest - Testing Data Score: {random_forest_model.score(X_test_ss_scaled, y_test)}")
print(f"Random Forest - Precision: {precision_score(y_test, y_pred)}")
print(f"Random Forest - Recall: {recall_score(y_test, y_pred)}")
print(f"Random Forest - F1 Score: {f1_score(y_test, y_pred)}")
print(f"Random Forest - Cross-Validation Accuracy: {cv_scores.mean()}")


# -GradientBoostingClassifier
# Score the model
print(f"Gradient Boosting Machine - Training Data Score: {gbm_model.score(X_train_ss_scaled, y_train)}")
print(f"Gradient Boosting Machine - Testing Data Score: {gbm_model.score(X_test_ss_scaled, y_test)}")
print(f"Gradient Boosting Machine - Precision: {precision_score(y_test, y_pred)}")
print(f"Gradient Boosting Machine - Recall: {recall_score(y_test, y_pred)}")
print(f"Gradient Boosting Machine - F1 Score: {f1_score(y_test, y_pred)}")
print(f"Gradient Boosting Machine - Cross-Validation Accuracy: {cv_scores.mean()}")
# -KNeighborsClassifier
# Score the model
print(f"K-Nearest Neighbors - Training Data Score: {knn_model.score(X_train_ss_scaled, y_train)}")
print(f"K-Nearest Neighbors - Testing Data Score: {knn_model.score(X_test_ss_scaled, y_test)}")
print(f"K-Nearest Neighbors - Precision: {precision_score(y_test, y_pred)}")
print(f"K-Nearest Neighbors - Recall: {recall_score(y_test, y_pred)}")
print(f"K-Nearest Neighbors - F1 Score: {f1_score(y_test, y_pred)}")
print(f"K-Nearest Neighbors - Cross-Validation Accuracy: {cv_scores.mean()}")
# -SVM (Support Vector Machine)
# Score the model
print(f"Support Vector Machine - Training Data Score: {svm_model.score(X_train_ss_scaled, y_train)}")
print(f"Support Vector Machine - Testing Data Score: {svm_model.score(X_test_ss_scaled, y_test)}")
print(f"Support Vector Machine - Precision: {precision_score(y_test, y_pred)}")
print(f"Support Vector Machine - Recall: {recall_score(y_test, y_pred)}")
print(f"Support Vector Machine - F1 Score: {f1_score(y_test, y_pred)}")
print(f"Support Vector Machine - Cross-Validation Accuracy: {cv_scores.mean()}")
# -LogisticRegression
# Score the model
print(f"Logistic Regression - Training Data Score: {logistic_regression_model.score(X_train_ss_scaled, y_train)}")
print(f"Logistic Regression - Testing Data Score: {logistic_regression_model.score(X_test_ss_scaled, y_test)}")
print(f"Logistic Regression - Precision: {precision_score(y_test, y_pred)}")
print(f"Logistic Regression - Recall: {recall_score(y_test, y_pred)}")
print(f"Logistic Regression - F1 Score: {f1_score(y_test, y_pred)}")
print(f"Logistic Regression - Cross-Validation Accuracy: {cv_scores.mean()}")
# -Decision Tree Model
# Score the model
print(f"Decision Tree - Training Data Score: {decision_tree_model.score(X_train_ss_scaled, y_train)}")
print(f"Decision Tree - Testing Data Score: {decision_tree_model.score(X_test_ss_scaled, y_test)}")
print(f"Decision Tree - Precision: {precision_score(y_test, y_pred)}")
print(f"Decision Tree - Recall: {recall_score(y_test, y_pred)}")
print(f"Decision Tree - F1 Score: {f1_score(y_test, y_pred)}")
print(f"Decision Tree - Cross-Validation Accuracy: {cv_scores.mean()}")

To clearly plot and interpret the results of a model using a RandomForestClassifier to predict customer response based on a list of features, you can follow these steps in your Jupyter Notebook:

Fit the Model: Train the RandomForestClassifier on your training data.
Make Predictions: Use the model to predict responses on the test set.
Evaluate the Model: Calculate key performance metrics like accuracy, precision, recall, and F1-score.
Feature Importance: Extract and plot the feature importances to understand which features are most influential in predicting customer responses.
Visualize Predictions: Create visualizations to display the predicted responses against actual responses.

Feature Importance: 
Extract and plot the feature importances to understand which features are most influential in predicting customer responses.

In [None]:
# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Detailed classification report
print(classification_report(y_test, y_pred))

In [None]:


features = X_train.columns
importances = random_forest_model.feature_importances_
indices = np.argsort(importances)

plt.figure(figsize=(10, 6))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()


In [None]:
# # Compare actual and predicted responses
# comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# # Sample some data to plot
# sampled_data = comparison_df.sample(50, random_state=42)
# sampled_data.plot(kind='bar', figsize=(14, 8))
# plt.title('Comparison of Actual and Predicted Responses')
# plt.show()


In [None]:
y_test = np.squeeze(y_test)
y_pred = np.squeeze(y_pred)

# Compare actual and predicted responses
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Sample some data to plot
sampled_data = comparison_df.sample(50, random_state=42)
sampled_data.plot(kind='bar', figsize=(14, 8))
plt.title('Comparison of Actual and Predicted Responses')
plt.show()

In [None]:
from collections import Counter
from sklearn.datasets import make_classification
from matplotlib import pyplot
from numpy import where
# define dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
 n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
# summarize class distribution
counter = Counter(y)
print(counter)
# scatter plot of examples by class label
for label, _ in counter.items():
 row_ix = where(y == label)[0]
 pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
pyplot.legend()
pyplot.show()

In [None]:
# Scatter plot for random forest
# 

import matplotlib.pyplot as plt
import numpy as np

features = X_train.columns
importances = model.feature_importances_
indices = np.argsort(importances)

plt.figure(figsize=(10, 6))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
