In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, name) for name in filenames]
    # guard against empty directories: a bare print() would emit a blank line
    if full_paths:
        print(*full_paths, sep="\n")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Notebook done by Abhishek, Nikhil, Vishal

### Given data - Columns
###### customer_id - a unique customer identification number
###### age - the age of the user
###### location - the US state of the user
###### subscription_type - type of subscription
###### payment_plan - how often the user pays, monthly or annually
###### num_subscription_pauses - number of times the user has paused their subscription (max 2)
###### payment_method - form of user payment
###### customer_service_inquiries - the frequency of customer service inquiries from the user
###### signup_date - date the user signed up for the music subscription service
###### weekly_hours - average number of weekly listening hours
###### average_session_length - average length of each music listening session (in hours)
###### song_skip_rate - percentage of songs the user does not finish
###### weekly_songs_played - average number of songs the user plays in a week
###### weekly_unique_songs - average number of unique songs the user plays in a week
###### num_favorite_artists - number of artists the user set as favorite artists
###### num_platform_friends - number of user connections in the app
###### num_playlists_created - number of playlists the user created
###### num_shared_playlists - number of playlists that are shared publicly
###### notifications_clicked - number of in-app notifications clicked on
###### churned - this is the target variable, 0 = customer is active, 1 = customer churned

In [None]:
# Load the training and test CSV files of the churn dataset into DataFrames.
# NOTE(review): the absolute /kaggle/input paths presumably only resolve inside a
# Kaggle session with this dataset attached — confirm before running elsewhere.
trainDF = pd.read_csv("/kaggle/input/music-subscriptions-churn-predictor/train.csv")
testDF = pd.read_csv("/kaggle/input/music-subscriptions-churn-predictor/test.csv")

In [None]:
trainDF.head()  # preview the first rows of the training data

In [None]:
trainDF.info()  # column dtypes and non-null counts, used to check for missing values

In [None]:
# Names of the categorical (object-typed) columns; this list drives the
# unique-value inspection and the label encoding.
objects = ['location','subscription_type', 'payment_plan','payment_method','customer_service_inquiries']

In [None]:
# Print the distinct values found in each categorical column of the training data.
for column_name in objects:
    distinct_values = trainDF[column_name].unique()
    print(distinct_values)

In [None]:
trainDF.describe()   # summary statistics, used to check for discrepancies in the distributions

Converting the signup date into numerical columns (year, month, day)

In [None]:
# Parse all signup dates in one vectorised call instead of constructing a
# pd.Timestamp per row in a Python loop, then split them into calendar parts.
# NOTE(review): assumes every signup_date value uses one consistent date format — confirm.
signup_dates = pd.to_datetime(trainDF['signup_date'])

# Kept as plain lists named year / month / day because the following cell
# assigns these three names to trainDF.
year = signup_dates.dt.year.tolist()
month = signup_dates.dt.month.tolist()
day = signup_dates.dt.day.tolist()

In [None]:
# Attach the extracted date parts to the training frame as three new columns.
trainDF['year'] = year
trainDF['month'] = month
trainDF['day'] = day

Dropping unnecessary columns

In [None]:
trainDF.drop('signup_date',axis =1,inplace = True)  # remove the raw date column (modifies trainDF in place)

In [None]:
trainDF.drop('customer_id',axis =1,inplace = True)  # remove the customer identifier column (modifies trainDF in place)

#### Using LabelEncoder since it keeps the number of columns in the dataset small (one encoded column per categorical feature)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column: the mapping is learned from the
# training data and the same fitted encoder is then applied to the test data.
for column_name in objects:
    encoder = LabelEncoder()
    trainDF[column_name] = encoder.fit_transform(trainDF[column_name])
    testDF[column_name] = encoder.transform(testDF[column_name])

In [None]:
trainDF.info()  # check the column dtypes after encoding

### Data Profiling

In [None]:
# create a data profiling function
def create_data_profiling_df(data: pd.DataFrame) -> pd.DataFrame:
    """Build a one-row-per-column profile of ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose columns are profiled.

    Returns
    -------
    pd.DataFrame
        One row per column of ``data`` with the fields ``column_name``,
        ``column_type``, ``unique_values`` (distinct values; a missing value
        counts as one distinct value), ``duplicate_values`` (non-null entries
        minus distinct non-null values), ``null_values``, ``max``, ``min``,
        ``range`` and ``IQR``. The last four are the string "NA" for object
        columns. Rows are sorted by ``unique_values`` in descending order.

    Notes
    -----
    If computing a statistic for a column raises, a message is printed and the
    column is still listed with whatever fields were computed before the error;
    the remaining fields are left missing.
    """
    profile_columns = ["column_name",
                       "column_type",
                       "unique_values",
                       "duplicate_values",
                       "null_values",
                       "max",
                       "min",
                       "range",
                       "IQR"]

    # collect one record per column and build the dataframe once at the end,
    # rather than growing a dataframe with pd.concat on every iteration
    records = []

    for column in data.columns:
        series = data[column]
        record = {"column_name": column}

        try:
            record["column_type"] = series.dtypes
            record["unique_values"] = len(series.unique())
            null_count = series.isna().sum()
            # nunique() ignores missing values; subtracting len(unique()) here
            # would count NaN as a value and under-report duplicates by one
            record["duplicate_values"] = (series.shape[0] - null_count) - series.nunique()
            record["null_values"] = null_count

            if series.dtypes != object:
                record["max"] = series.max()
                record["min"] = series.min()
                record["range"] = record["max"] - record["min"]
                record["IQR"] = series.quantile(.75) - series.quantile(.25)
            else:
                # order statistics are not reported for object columns
                for stat_name in ("max", "min", "range", "IQR"):
                    record[stat_name] = "NA"

        except Exception:
            # best effort: report the problem and keep the partially filled row
            print(f"unable to read column: {column}, you may want to drop this column")

        records.append(record)

    # passing columns= keeps the expected layout even when there are no records
    data_profiling_df = pd.DataFrame(records, columns=profile_columns)

    # sort the final dataframe by unique values descending
    data_profiling_df = data_profiling_df.sort_values(by=['unique_values'],
                                                      ascending=[False])

    # print the function is complete
    print(f"data profiling complete, dataframe contains {len(data_profiling_df)} columns")
    return data_profiling_df

In [None]:
# run the data profiling function on the training data and display the resulting dataframe
data_profiling_df = create_data_profiling_df(data = trainDF)
data_profiling_df


## Checking for outliers in the data

In [None]:
from matplotlib import pyplot as plt

# Box plot of notifications_clicked to look for outliers. Drawn on an explicit
# Axes and given a title and axis label so the figure can be read on its own.
fig, ax = plt.subplots()
ax.boxplot(trainDF['notifications_clicked'])
ax.set_title('Distribution of notifications_clicked')
ax.set_ylabel('notifications_clicked')
plt.show()

In [None]:
from matplotlib import pyplot as plt

# Box plot of weekly_unique_songs to look for outliers. Drawn on an explicit
# Axes and given a title and axis label so the figure can be read on its own.
fig, ax = plt.subplots()
ax.boxplot(trainDF['weekly_unique_songs'])
ax.set_title('Distribution of weekly_unique_songs')
ax.set_ylabel('weekly_unique_songs')
plt.show()

In [None]:
from matplotlib import pyplot as plt

# Box plot of num_shared_playlists to look for outliers. Drawn on an explicit
# Axes and given a title and axis label so the figure can be read on its own.
fig, ax = plt.subplots()
ax.boxplot(trainDF['num_shared_playlists'])
ax.set_title('Distribution of num_shared_playlists')
ax.set_ylabel('num_shared_playlists')
plt.show()

In [None]:
# Drop duplicate rows from the training dataset in place
# (drop_duplicates compares rows, not columns).
trainDF.drop_duplicates(inplace = True)

In [None]:
# Separate the target from the features: pop() removes 'churned' from trainDF
# in place and returns it, so X (the same object as trainDF) holds features only.
y = trainDF.pop('churned')
X = trainDF

In [None]:
X.info()  # check the feature columns and their dtypes

In [None]:
y.info()  # check the target series

### Feature Engineering

In [None]:
# Use pairwise correlation between features to decide which ones to filter out
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# absolute correlation between every pair of features
corr_matrix = X.corr().abs()

# keep only the entries strictly above the diagonal so each pair appears once
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_triangle = corr_matrix.where(above_diagonal)

# draw the annotated heatmap of that upper triangle
plt.figure(figsize=(8, 6))
sns.heatmap(upper_triangle,
            annot=True,
            cmap='coolwarm',
            fmt=".2f",
            linewidths=.5)
plt.title('Correlation Heatmap of Features')
plt.show()

In [None]:
# function to find highly correlated features (it does not drop them itself)
def find_highly_correlated_features(X: pd.DataFrame,
                                    threshold=0.8) -> list:
    """Return the names of features that are highly correlated with an earlier feature.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame. Only numeric and boolean columns take part in the
        correlation; other columns are ignored instead of raising.
    threshold : float, default 0.8
        A column is flagged when its absolute correlation with any column
        positioned before it is strictly greater than this value.

    Returns
    -------
    list
        Column labels to drop, in column order. ``X`` itself is not modified.
    """
    # restrict to columns a correlation can be computed for
    numeric_features = X.select_dtypes(include=["number", "bool"])

    # create a correlation matrix
    corr_matrix = numeric_features.corr().abs()

    # select the upper triangle (k=1 excludes the diagonal) so that each pair is
    # looked at once and only the later column of a pair gets flagged
    upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

    # find features with correlation greater than the threshold
    features_to_drop = [column for column in upper_triangle.columns
                        if (upper_triangle[column] > threshold).any()]

    # print and return the features to drop
    print(f"features dropped: {features_to_drop}")
    return features_to_drop

In [None]:
features_to_drop = find_highly_correlated_features(X = X, threshold = 0.7)  # uses a 0.7 cut-off instead of the function's 0.8 default

In [None]:
X.drop(columns=features_to_drop,axis =1 , inplace = True)  # remove the flagged features from X in place

In [None]:
X.info()  # check the remaining feature columns

In [None]:
y.info()  # check the target series

### Data Preparation

In [None]:
# Split X and y into training and hold-out sets: 30% of the rows are held out
# and the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified on y; consider stratify=y if the
# churn classes are imbalanced — confirm against the class counts.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

### Normalization

#### Using RobustScaler since it is better suited to the outliers found earlier

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import RobustScaler

# Active choice: RobustScaler. It is fitted on the training split only and the
# same fitted scaler is then applied to the hold-out split.
# NOTE(review): X_train / X_test are overwritten with their scaled versions, so
# re-running this cell scales already-scaled data.
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# --- Disabled alternatives (commented out) ---

# min-max scaling of the features
#scaler = MinMaxScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

# standardize the features
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

# apply PCA to reduce dimensionality
#pca = PCA(n_components=2)
#X_train = pca.fit_transform(X_train)
#X_test = pca.transform(X_test)

# apply LDA to reduce dimensionality
#lda = LinearDiscriminantAnalysis(n_components=1)
#X_train = lda.fit_transform(X_train, y_train)
#X_test = lda.transform(X_test)

## Creating Classification Models

### Gradient Boosting Classifier

In [None]:
# import libraries
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
import time

# Start the timer, then build and fit a gradient boosting classifier on the training split
start = time.time()
clf = GradientBoostingClassifier(random_state=55).fit(X_train, y_train)

# Predict labels for the hold-out split
y_pred = clf.predict(X_test)

# Print the classification report and the number of rows predicted as class 1
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
churn_predictions = np.count_nonzero(y_pred == 1)
print("No. of 1's predicted: ", churn_predictions)

# Print the elapsed wall-clock time (includes fitting, predicting and reporting)
end = time.time()
print(f"completed in {round(end-start, 2)} seconds")

### Decision Tree

In [None]:
# import libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import time

# Start the timer, then build and fit a decision tree classifier on the training split
start = time.time()
clf = DecisionTreeClassifier(random_state=55).fit(X_train, y_train)

# Predict labels for the hold-out split
y_pred = clf.predict(X_test)

# Print the classification report and the number of rows predicted as class 1
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
churn_predictions = np.count_nonzero(y_pred == 1)
print("No. of 1's predicted: ", churn_predictions)

# Print the elapsed wall-clock time (includes fitting, predicting and reporting)
end = time.time()
print(f"completed in {round(end-start, 2)} seconds")

### SVM

In [None]:
# import libraries
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import time

# Start the timer, then build and fit a support vector classifier on the training split
start = time.time()
clf = SVC(random_state=55).fit(X_train, y_train)

# Predict labels for the hold-out split
y_pred = clf.predict(X_test)

# Print the classification report and the number of rows predicted as class 1
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
churn_predictions = np.count_nonzero(y_pred == 1)
print("No. of 1's predicted: ", churn_predictions)

# Print the elapsed wall-clock time (includes fitting, predicting and reporting)
end = time.time()
print(f"completed in {round(end-start, 2)} seconds")

### Random Forests Classifier

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import time

# Start the timer, then build and fit a random forest of 800 trees on the training split
start = time.time()
clf = RandomForestClassifier(n_estimators= 800, random_state=0).fit(X_train, y_train)

# Predict labels for the hold-out split
y_pred = clf.predict(X_test)

# Print the classification report and the number of rows predicted as class 1
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
churn_predictions = np.count_nonzero(y_pred == 1)
print("No. of 1's predicted: ", churn_predictions)

# Print the elapsed wall-clock time (includes fitting, predicting and reporting)
end = time.time()
print(f"completed in {round(end-start, 2)} seconds")

###### Gradient boosting classifier seems to do better with data imbalances

### Preparing test data

In [None]:
testDF.info()  # inspect the test data before preparing it

In [None]:
# Parse all signup dates in one vectorised call instead of constructing a
# pd.Timestamp per row in a Python loop, then split them into calendar parts.
# NOTE(review): assumes every signup_date value uses one consistent date format — confirm.
signup_dates = pd.to_datetime(testDF['signup_date'])

# kept as plain lists under the same names the original loop populated
year = signup_dates.dt.year.tolist()
month = signup_dates.dt.month.tolist()
day = signup_dates.dt.day.tolist()

testDF['year'] = year
testDF['month'] = month
testDF['day'] = day

# keep the customer ids for the submission file before the column is dropped
idSet = testDF['customer_id']

# remove the raw date, the id and the features flagged as highly correlated,
# all in a single in-place drop
testDF.drop(columns=['signup_date', 'customer_id', *features_to_drop], inplace = True)

In [None]:
idSet = pd.DataFrame(idSet, columns = ['customer_id'])  # wrap the ids in a one-column frame for the submission

In [None]:
testDF.info()  # check the prepared test features

In [None]:
# Scale the test features with the already-fitted scaler.
# NOTE(review): this rebinds testDF to the transform's output, so running the
# cell a second time would scale the data again.
testDF = scaler.transform(testDF)

### Making Predictions

In [None]:
# import libraries
from sklearn.ensemble import GradientBoostingClassifier
import time

# Start the timer, then build and fit a gradient boosting classifier on the training split
start = time.time()
clf = GradientBoostingClassifier(random_state=55).fit(X_train, y_train)

# Predict labels for the prepared test data
y_pred = clf.predict(testDF)

# Stop the timer, then print the number of rows predicted as class 1 and the elapsed time
end = time.time()
churn_predictions = np.count_nonzero(y_pred == 1)
print("No. of 1's predicted: ", churn_predictions)
print(f"completed in {round(end-start, 2)} seconds")

In [None]:
y_pred = pd.DataFrame(y_pred, columns = ['churned'])  # predictions as a one-column frame named 'churned'

In [None]:
output = pd.concat([idSet, y_pred],axis =1)  # place ids and predictions side by side (concat aligns them on the index)

In [None]:
output  # display the submission frame

In [None]:
output.to_csv("submission.csv", index = False)  # write the submission file without the index column

## With your model, can we reliably predict which customers will churn?
### With a good training accuracy of 90 percent and good precision and recall, we are fairly confident that the model will identify churning customers well and will not cause losses for the company.

## How does your model work?
### Our model is based on gradient boosting, since it was the best model in our test results at handling imbalances in the data. The other models falsely predict churn or output only 0's.

## From the 2,500 customers in the testing data (test.csv), what percentage of customers do you expect we will retain?
### We expect to retain about 90-92% of the customers, that is, nearly 2,300 customers. This estimate is obtained from the roughly 90% accuracy and precision of the gradient boosting classifier model.