In [1]:

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel


import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem.porter import *

# Import libraries for feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics
from sklearn.model_selection import cross_val_score

#Train-Test Split
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")


from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Mounted at /gdrive
/gdrive


In [2]:
#Upload csv data as dataframe
textfile = r'/gdrive/My Drive/Projects/SentimentAnalysis/Comments.csv'
textData = pd.read_csv(textfile) 

CustInfofile = r'/gdrive/My Drive/Projects/SentimentAnalysis/Customers.csv'
CustInfoData = pd.read_csv(CustInfofile)  

print(textData.shape)
print(CustInfoData.shape)


(2070, 2)
(2070, 17)


In [3]:
# Evaluating how balanced our customer info dataset is

CustInfoData['TARGET'].value_counts()

print('Percent current is', 1266/(1266+804))
print('Percent cancelled is', 804/(1266+804))

Percent current is 0.6115942028985507
Percent cancelled is 0.3884057971014493


In [4]:
#Separate target column from Customer Info file
y_train = CustInfoData["TARGET"]
X_train = CustInfoData.drop(columns=["TARGET"]) #extracting training data without the target column

In [5]:
#Use tokenize to split the sentences to lists of words
textData['CommentsTokenized'] = textData['Comments'].apply(word_tokenize)

export_csv = textData.to_csv(r'/gdrive/My Drive/Projects/SentimentAnalysis/TextDataTokenized1.csv')

In [6]:
# Utilize porter stemmer
stemmer = PorterStemmer()

#Create a new dataframe to store stemmed version (stemming)
newTextData=pd.DataFrame()
newTextData=textData.drop(columns=["CommentsTokenized","Comments"])
newTextData['CommentsTokenizedStemmed'] = textData['CommentsTokenized'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.

export_csv = newTextData.to_csv(r'/gdrive/My Drive/Projects/SentimentAnalysis/TextDataTokenizedStemmedPorter.csv')

#Information about the different stemmers for examplantion --> https://www.datacamp.com/community/tutorials/stemming-lemmatization-python


In [7]:
# Utilize snowball stemmer
stemmer = SnowballStemmer("english")

#Create a new dataframe to store stemmed version (stemming)
newTextData=pd.DataFrame()
newTextData=textData.drop(columns=["CommentsTokenized","Comments"])
newTextData['CommentsTokenizedStemmed'] = textData['CommentsTokenized'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.

export_csv = newTextData.to_csv(r'/gdrive/My Drive/Projects/SentimentAnalysis/TextDataTokenizedStemmedSnowball.csv')


In [8]:

#Join stemmed strings
newTextData['CommentsTokenizedStemmed'] = newTextData['CommentsTokenizedStemmed'].apply(lambda x: " ".join(x))

export_csv = newTextData.to_csv(r'/gdrive/My Drive/Projects/SentimentAnalysis/NewTextDataJoined.csv')

In [9]:
#Construct Bag-Of-Words model - Term - Document Matrix and ELIMINATE STOP WORDS
#count_vect = CountVectorizer(stop_words=None)
count_vect = CountVectorizer(stop_words='english',lowercase=False)
TD_counts = count_vect.fit_transform(newTextData.CommentsTokenizedStemmed)
print(TD_counts.shape)
print(TD_counts.dtype)
print(count_vect.get_feature_names())
#print(TD_counts)
DF_TD_Counts=pd.DataFrame(TD_counts.toarray())
print(DF_TD_Counts)
export_csv = DF_TD_Counts.to_csv(r'/gdrive/My Drive/Projects/SentimentAnalysis/TD_counts-TokenizedStemmed.csv')

#This matrix is counting the instances of each token for each record in this dataset
#More information on bag of words model and the purposes behind vectorizing text --> https://machinelearningmastery.com/gentle-introduction-bag-words-model/


(2070, 354)
int64
['3399', '3g', 'abysm', 'access', 'accessori', 'adapt', 'add', 'addit', 'additon', 'address', 'adit', 'adress', 'advertis', 'afraid', 'alway', 'angel', 'angri', 'ani', 'anoth', 'anyth', 'anytim', 'area', 'asap', 'ask', 'bad', 'basic', 'bateri', 'batteri', 'becaus', 'believ', 'better', 'bigger', 'book', 'bought', 'brain', 'bring', 'built', 'busi', 'button', 'buy', 'cancel', 'cancer', 'car', 'care', 'carrier', 'caus', 'cc', 'cell', 'certain', 'chang', 'charg', 'charger', 'check', 'chip', 'citi', 'claim', 'cleariti', 'cold', 'comapr', 'compani', 'compar', 'competit', 'complain', 'complaint', 'concept', 'connect', 'consisit', 'consist', 'constan', 'contact', 'continu', 'contract', 'correct', 'cost', 'coupl', 'cover', 'coverag', 'creat', 'credit', 'cstmer', 'cstmr', 'current', 'cust', 'custom', 'customr', 'date', 'day', 'dead', 'decent', 'defect', 'deo', 'did', 'die', 'differ', 'difficult', 'digiti', 'direct', 'disabl', 'doe', 'don', 'dont', 'drop', 'dure', 'easier', 'effe

In [10]:
#Construct TF-IDF Matrix
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(TD_counts)
print(X_train_tfidf.shape)
DF_TF_IDF=pd.DataFrame(X_train_tfidf.toarray())
print(DF_TF_IDF)
export_csv= DF_TF_IDF.to_csv(r'/gdrive/My Drive/Projects/SentimentAnalysis/TFIDF_counts-TokenizedStemmed.csv')

#This matrix is a numerical statistic that reflects the importance of each word/token 


(2070, 354)
      0    1    2    3        4    5    ...  348  349  350  351  352  353
0     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
1     0.0  0.0  0.0  0.0  0.27568  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
3     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
4     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
...   ...  ...  ...  ...      ...  ...  ...  ...  ...  ...  ...  ...  ...
2065  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2066  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2067  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2068  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2069  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0

[2070 rows x 354 columns]


In [11]:
#Combine CustInfo Data with our TF_IDF matrix (feature selection has not occurred yet)
print(CustInfoData.shape)
X_train = CustInfoData.drop(columns=["TARGET"]) #extracting training data without the target column
print(X_train.shape)

combined=pd.concat([X_train, DF_TF_IDF], axis=1)
print(combined.shape)
print(combined)
export_csv= combined.to_csv(r'/gdrive/My Drive/Projects/SentimentAnalysis/Combined-Cust+TFIDF+SelectedFeatures.csv')

#By combining these two dataframes, we now have a more complete dataset

(2070, 17)
(2070, 16)
(2070, 370)
        ID Sex Status  Children  Est_Income  ...  349  350  351  352  353
0        1   F      S         1    38000.00  ...  0.0  0.0  0.0  0.0  0.0
1        6   M      M         2    29616.00  ...  0.0  0.0  0.0  0.0  0.0
2        8   M      M         0    19732.80  ...  0.0  0.0  0.0  0.0  0.0
3       11   M      S         2       96.33  ...  0.0  0.0  0.0  0.0  0.0
4       14   F      M         2    52004.80  ...  0.0  0.0  0.0  0.0  0.0
...    ...  ..    ...       ...         ...  ...  ...  ...  ...  ...  ...
2065  3821   F      S         0    78851.30  ...  0.0  0.0  0.0  0.0  0.0
2066  3822   F      S         1    17540.70  ...  0.0  0.0  0.0  0.0  0.0
2067  3823   F      M         0    83891.90  ...  0.0  0.0  0.0  0.0  0.0
2068  3824   F      M         2    28220.80  ...  0.0  0.0  0.0  0.0  0.0
2069  3825   F      S         0    28589.10  ...  0.0  0.0  0.0  0.0  0.0

[2070 rows x 370 columns]


In [12]:
#Performing the necessary OHE for categorical features in this new dataset
X_cat = ["Sex","Status","Car_Owner","Paymethod","LocalBilltype","LongDistanceBilltype"]
#X_cat = combined.select_dtypes(exclude=['int','float64'])
print(X_cat)
combined_one_hot = pd.get_dummies(combined,columns=X_cat)
print(combined_one_hot.shape)
export_csv= combined_one_hot.to_csv(r'/gdrive/My Drive/Projects/SentimentAnalysis/combined_one_hot.csv')

['Sex', 'Status', 'Car_Owner', 'Paymethod', 'LocalBilltype', 'LongDistanceBilltype']
(2070, 378)


Feature Selection - Filter Method (Testing different values for k with a Random Forest Classifier)

In [13]:
#In this cell, testing accuracy/cv scores for k = 10 features and Random Forest Classifier
#Feature selection on combined dataset (Chi-2 Square Scoring)

combined_SelectKBest_chi2 = SelectKBest(score_func=chi2, k=10).fit_transform(combined_one_hot,y_train)
print(combined_SelectKBest_chi2.shape)

combined_SelectedFeatures= pd.DataFrame(combined_SelectKBest_chi2)
print(combined_SelectedFeatures)

#This step cuts down the size of our matrix by selecting only a designated amount (k) of the best features from our combined dataset

#Perform Train-Test split on the selected feature data

X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_SelectKBest_chi2, y_train, test_size = .20, random_state = 1)

#Build random forest classifier on combined dataset (feature ranking)

rf =RandomForestClassifier()
rf_combined = rf.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(rf.score(X_Test, Y_Test)))
rf_predictions = rf.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, rf_predictions))
print("Classification Report")
print(classification_report(Y_Test, rf_predictions))

#run cross-validation - Random Forest Model
rf_combined_cv_score = cross_val_score(rf_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(rf_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",rf_combined_cv_score.mean())
print('\n')

(2070, 10)
           0    1         2       3      4     5       6    7    8    9
0        1.0  1.0  38000.00  229.64  23.56  0.00  206.08  0.0  0.0  1.0
1        6.0  2.0  29616.00   75.29  29.78  0.00   45.50  1.0  1.0  0.0
2        8.0  0.0  19732.80   47.25  24.81  0.00   22.44  1.0  1.0  0.0
3       11.0  2.0     96.33   59.01  26.13  0.00   32.88  1.0  0.0  1.0
4       14.0  2.0  52004.80   28.14   5.03  0.00   23.11  0.0  1.0  0.0
...      ...  ...       ...     ...    ...   ...     ...  ...  ...  ...
2065  3821.0  0.0  78851.30   29.04   0.37  0.00   28.66  0.0  0.0  1.0
2066  3822.0  1.0  17540.70   36.20  22.17  0.57   13.45  0.0  0.0  1.0
2067  3823.0  0.0  83891.90   74.40  28.92  0.00   45.47  0.0  1.0  0.0
2068  3824.0  2.0  28220.80   38.95  26.49  0.00   12.46  0.0  1.0  0.0
2069  3825.0  0.0  28589.10  100.28  13.19  0.00   87.09  0.0  0.0  1.0

[2070 rows x 10 columns]
Accuracy score: 0.852657
Confusion Matrix:
[[124  26]
 [ 35 229]]
Classification Report
           

In [14]:
#In this cell, testing accuracy/cv scores for k = 25 features and Random Forest Classifier
#Feature selection on combined dataset (Chi-2 Square Scoring)

combined_SelectKBest_chi2 = SelectKBest(score_func=chi2, k=25).fit_transform(combined_one_hot,y_train)
print(combined_SelectKBest_chi2.shape)

combined_SelectedFeatures= pd.DataFrame(combined_SelectKBest_chi2)
print(combined_SelectedFeatures)

#This step cuts down the size of our matrix by selecting only a designated amount (k) of the best features from our combined dataset

#Perform Train-Test split on the selected feature data

X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_SelectKBest_chi2, y_train, test_size = .20, random_state = 1)

#Build random forest classifier on combined dataset (feature ranking)

rf =RandomForestClassifier()
rf_combined = rf.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(rf.score(X_Test, Y_Test)))
rf_predictions = rf.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, rf_predictions))
print("Classification Report")
print(classification_report(Y_Test, rf_predictions))

#run cross-validation - Random Forest Model
rf_combined_cv_score = cross_val_score(rf_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(rf_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",rf_combined_cv_score.mean())
print('\n')

(2070, 25)
          0    1         2       3          4   ...   20   21   22   23   24
0        1.0  1.0  38000.00  229.64  24.393333  ...  1.0  0.0  1.0  1.0  0.0
1        6.0  2.0  29616.00   75.29  49.426667  ...  0.0  0.0  0.0  0.0  1.0
2        8.0  0.0  19732.80   47.25  50.673333  ...  0.0  0.0  1.0  0.0  1.0
3       11.0  2.0     96.33   59.01  56.473333  ...  1.0  0.0  1.0  0.0  1.0
4       14.0  2.0  52004.80   28.14  25.140000  ...  0.0  0.0  0.0  1.0  0.0
...      ...  ...       ...     ...        ...  ...  ...  ...  ...  ...  ...
2065  3821.0  0.0  78851.30   29.04  48.373333  ...  1.0  0.0  1.0  0.0  1.0
2066  3822.0  1.0  17540.70   36.20  62.786667  ...  1.0  1.0  0.0  0.0  1.0
2067  3823.0  0.0  83891.90   74.40  61.020000  ...  0.0  0.0  0.0  0.0  1.0
2068  3824.0  2.0  28220.80   38.95  38.766667  ...  0.0  0.0  1.0  0.0  1.0
2069  3825.0  0.0  28589.10  100.28  15.600000  ...  1.0  0.0  1.0  0.0  1.0

[2070 rows x 25 columns]
Accuracy score: 0.852657
Confusion Matr

In [15]:
#In this cell, testing accuracy/cv scores for k = 50 features and Random Forest Classifier
#Feature selection on combined dataset (Chi-2 Square Scoring)

combined_SelectKBest_chi2 = SelectKBest(score_func=chi2, k=50).fit_transform(combined_one_hot,y_train)
print(combined_SelectKBest_chi2.shape)

combined_SelectedFeatures= pd.DataFrame(combined_SelectKBest_chi2)
print(combined_SelectedFeatures)

#This step cuts down the size of our matrix by selecting only a designated amount (k) of the best features from our combined dataset

#Perform Train-Test split on the selected feature data

X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_SelectKBest_chi2, y_train, test_size = .20, random_state = 1)

#Build random forest classifier on combined dataset (feature ranking)

rf =RandomForestClassifier()
rf_combined = rf.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(rf.score(X_Test, Y_Test)))
rf_predictions = rf.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, rf_predictions))
print("Classification Report")
print(classification_report(Y_Test, rf_predictions))

#run cross-validation - Random Forest Model
rf_combined_cv_score = cross_val_score(rf_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(rf_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",rf_combined_cv_score.mean())
print('\n')

(2070, 50)
          0    1         2       3          4   ...   45   46   47   48   49
0        1.0  1.0  38000.00  229.64  24.393333  ...  1.0  0.0  1.0  1.0  0.0
1        6.0  2.0  29616.00   75.29  49.426667  ...  0.0  0.0  0.0  0.0  1.0
2        8.0  0.0  19732.80   47.25  50.673333  ...  0.0  0.0  1.0  0.0  1.0
3       11.0  2.0     96.33   59.01  56.473333  ...  1.0  0.0  1.0  0.0  1.0
4       14.0  2.0  52004.80   28.14  25.140000  ...  0.0  0.0  0.0  1.0  0.0
...      ...  ...       ...     ...        ...  ...  ...  ...  ...  ...  ...
2065  3821.0  0.0  78851.30   29.04  48.373333  ...  1.0  0.0  1.0  0.0  1.0
2066  3822.0  1.0  17540.70   36.20  62.786667  ...  1.0  1.0  0.0  0.0  1.0
2067  3823.0  0.0  83891.90   74.40  61.020000  ...  0.0  0.0  0.0  0.0  1.0
2068  3824.0  2.0  28220.80   38.95  38.766667  ...  0.0  0.0  1.0  0.0  1.0
2069  3825.0  0.0  28589.10  100.28  15.600000  ...  1.0  0.0  1.0  0.0  1.0

[2070 rows x 50 columns]
Accuracy score: 0.850242
Confusion Matr

In [16]:
#In this cell, testing accuracy/cv scores for k = 75 features and Random Forest Classifier
#Feature selection on combined dataset (Chi-2 Square Scoring)

combined_SelectKBest_chi2 = SelectKBest(score_func=chi2, k=75).fit_transform(combined_one_hot,y_train)
print(combined_SelectKBest_chi2.shape)

combined_SelectedFeatures= pd.DataFrame(combined_SelectKBest_chi2)
print(combined_SelectedFeatures)

#This step cuts down the size of our matrix by selecting only a designated amount (k) of the best features from our combined dataset

#Perform Train-Test split on the selected feature data

X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_SelectKBest_chi2, y_train, test_size = .20, random_state = 1)

#Build random forest classifier on combined dataset (feature ranking)

rf =RandomForestClassifier()
rf_combined = rf.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(rf.score(X_Test, Y_Test)))
rf_predictions = rf.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, rf_predictions))
print("Classification Report")
print(classification_report(Y_Test, rf_predictions))

#run cross-validation - Random Forest Model
rf_combined_cv_score = cross_val_score(rf_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(rf_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",rf_combined_cv_score.mean())
print('\n')

(2070, 75)
          0    1         2       3          4   ...   70   71   72   73   74
0        1.0  1.0  38000.00  229.64  24.393333  ...  1.0  0.0  1.0  1.0  0.0
1        6.0  2.0  29616.00   75.29  49.426667  ...  0.0  0.0  0.0  0.0  1.0
2        8.0  0.0  19732.80   47.25  50.673333  ...  0.0  0.0  1.0  0.0  1.0
3       11.0  2.0     96.33   59.01  56.473333  ...  1.0  0.0  1.0  0.0  1.0
4       14.0  2.0  52004.80   28.14  25.140000  ...  0.0  0.0  0.0  1.0  0.0
...      ...  ...       ...     ...        ...  ...  ...  ...  ...  ...  ...
2065  3821.0  0.0  78851.30   29.04  48.373333  ...  1.0  0.0  1.0  0.0  1.0
2066  3822.0  1.0  17540.70   36.20  62.786667  ...  1.0  1.0  0.0  0.0  1.0
2067  3823.0  0.0  83891.90   74.40  61.020000  ...  0.0  0.0  0.0  0.0  1.0
2068  3824.0  2.0  28220.80   38.95  38.766667  ...  0.0  0.0  1.0  0.0  1.0
2069  3825.0  0.0  28589.10  100.28  15.600000  ...  1.0  0.0  1.0  0.0  1.0

[2070 rows x 75 columns]
Accuracy score: 0.864734
Confusion Matr

In [17]:
#In this cell, testing accuracy/cv scores for k = 150 features and Random Forest Classifier
#Feature selection on combined dataset (Chi-2 Square Scoring)

combined_SelectKBest_chi2 = SelectKBest(score_func=chi2, k=150).fit_transform(combined_one_hot,y_train)
print(combined_SelectKBest_chi2.shape)

combined_SelectedFeatures= pd.DataFrame(combined_SelectKBest_chi2)
print(combined_SelectedFeatures)

#This step cuts down the size of our matrix by selecting only a designated amount (k) of the best features from our combined dataset

#Perform Train-Test split on the selected feature data

X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_SelectKBest_chi2, y_train, test_size = .20, random_state = 1)

#Build random forest classifier on combined dataset (feature ranking)

rf =RandomForestClassifier()
rf_combined = rf.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(rf.score(X_Test, Y_Test)))
rf_predictions = rf.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, rf_predictions))
print("Classification Report")
print(classification_report(Y_Test, rf_predictions))

#run cross-validation - Random Forest Model
rf_combined_cv_score = cross_val_score(rf_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(rf_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",rf_combined_cv_score.mean())
print('\n')

(2070, 150)
         0    1         2       3          4    ...  145  146  147  148  149
0        1.0  1.0  38000.00  229.64  24.393333  ...  1.0  0.0  1.0  1.0  0.0
1        6.0  2.0  29616.00   75.29  49.426667  ...  0.0  0.0  0.0  0.0  1.0
2        8.0  0.0  19732.80   47.25  50.673333  ...  0.0  0.0  1.0  0.0  1.0
3       11.0  2.0     96.33   59.01  56.473333  ...  1.0  0.0  1.0  0.0  1.0
4       14.0  2.0  52004.80   28.14  25.140000  ...  0.0  0.0  0.0  1.0  0.0
...      ...  ...       ...     ...        ...  ...  ...  ...  ...  ...  ...
2065  3821.0  0.0  78851.30   29.04  48.373333  ...  1.0  0.0  1.0  0.0  1.0
2066  3822.0  1.0  17540.70   36.20  62.786667  ...  1.0  1.0  0.0  0.0  1.0
2067  3823.0  0.0  83891.90   74.40  61.020000  ...  0.0  0.0  0.0  0.0  1.0
2068  3824.0  2.0  28220.80   38.95  38.766667  ...  0.0  0.0  1.0  0.0  1.0
2069  3825.0  0.0  28589.10  100.28  15.600000  ...  1.0  0.0  1.0  0.0  1.0

[2070 rows x 150 columns]
Accuracy score: 0.850242
Confusion Ma

Feature Selection - Filter Method (Testing different values for k with a Gradient Boosting Classifier)

In [18]:
#In this cell, testing accuracy/cv scores for k = 10 features and Gradient Boosting
#Feature selection on combined dataset (Chi-2 Square Scoring)

combined_SelectKBest_chi2 = SelectKBest(score_func=chi2, k=10).fit_transform(combined_one_hot,y_train)
print(combined_SelectKBest_chi2.shape)

combined_SelectedFeatures= pd.DataFrame(combined_SelectKBest_chi2)
print(combined_SelectedFeatures)

#This step cuts down the size of our matrix by selecting only a designated amount (k) of the best features from our combined dataset

#Perform Train-Test split on the selected feature data

X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_SelectKBest_chi2, y_train, test_size = .20, random_state = 1)

#Build gradient boosting classifier on combined dataset (feature ranking)

gb =GradientBoostingClassifier()
gb_combined = gb.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(gb.score(X_Test, Y_Test)))
gb_predictions = gb.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, gb_predictions))
print("Classification Report")
print(classification_report(Y_Test, gb_predictions))

#run cross-validation - Gradient Boosting Model
gb_combined_cv_score = cross_val_score(gb_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(gb_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",gb_combined_cv_score.mean())
print('\n')

(2070, 10)
           0    1         2       3      4     5       6    7    8    9
0        1.0  1.0  38000.00  229.64  23.56  0.00  206.08  0.0  0.0  1.0
1        6.0  2.0  29616.00   75.29  29.78  0.00   45.50  1.0  1.0  0.0
2        8.0  0.0  19732.80   47.25  24.81  0.00   22.44  1.0  1.0  0.0
3       11.0  2.0     96.33   59.01  26.13  0.00   32.88  1.0  0.0  1.0
4       14.0  2.0  52004.80   28.14   5.03  0.00   23.11  0.0  1.0  0.0
...      ...  ...       ...     ...    ...   ...     ...  ...  ...  ...
2065  3821.0  0.0  78851.30   29.04   0.37  0.00   28.66  0.0  0.0  1.0
2066  3822.0  1.0  17540.70   36.20  22.17  0.57   13.45  0.0  0.0  1.0
2067  3823.0  0.0  83891.90   74.40  28.92  0.00   45.47  0.0  1.0  0.0
2068  3824.0  2.0  28220.80   38.95  26.49  0.00   12.46  0.0  1.0  0.0
2069  3825.0  0.0  28589.10  100.28  13.19  0.00   87.09  0.0  0.0  1.0

[2070 rows x 10 columns]
Accuracy score: 0.845411
Confusion Matrix:
[[116  34]
 [ 30 234]]
Classification Report
           

In [19]:
#In this cell, testing accuracy/cv scores for k = 25 features and Gradient Boosting
#Feature selection on combined dataset (Chi-2 Square Scoring)

combined_SelectKBest_chi2 = SelectKBest(score_func=chi2, k=25).fit_transform(combined_one_hot,y_train)
print(combined_SelectKBest_chi2.shape)

combined_SelectedFeatures= pd.DataFrame(combined_SelectKBest_chi2)
print(combined_SelectedFeatures)

#This step cuts down the size of our matrix by selecting only a designated amount (k) of the best features from our combined dataset

#Perform Train-Test split on the selected feature data

X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_SelectKBest_chi2, y_train, test_size = .20, random_state = 1)

#Build gradient boosting classifier on combined dataset (feature ranking)

gb =GradientBoostingClassifier()
gb_combined = gb.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(gb.score(X_Test, Y_Test)))
gb_predictions = gb.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, gb_predictions))
print("Classification Report")
print(classification_report(Y_Test, gb_predictions))

#run cross-validation - Gradient Boosting Model
gb_combined_cv_score = cross_val_score(gb_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(gb_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",gb_combined_cv_score.mean())
print('\n')

(2070, 25)
          0    1         2       3          4   ...   20   21   22   23   24
0        1.0  1.0  38000.00  229.64  24.393333  ...  1.0  0.0  1.0  1.0  0.0
1        6.0  2.0  29616.00   75.29  49.426667  ...  0.0  0.0  0.0  0.0  1.0
2        8.0  0.0  19732.80   47.25  50.673333  ...  0.0  0.0  1.0  0.0  1.0
3       11.0  2.0     96.33   59.01  56.473333  ...  1.0  0.0  1.0  0.0  1.0
4       14.0  2.0  52004.80   28.14  25.140000  ...  0.0  0.0  0.0  1.0  0.0
...      ...  ...       ...     ...        ...  ...  ...  ...  ...  ...  ...
2065  3821.0  0.0  78851.30   29.04  48.373333  ...  1.0  0.0  1.0  0.0  1.0
2066  3822.0  1.0  17540.70   36.20  62.786667  ...  1.0  1.0  0.0  0.0  1.0
2067  3823.0  0.0  83891.90   74.40  61.020000  ...  0.0  0.0  0.0  0.0  1.0
2068  3824.0  2.0  28220.80   38.95  38.766667  ...  0.0  0.0  1.0  0.0  1.0
2069  3825.0  0.0  28589.10  100.28  15.600000  ...  1.0  0.0  1.0  0.0  1.0

[2070 rows x 25 columns]
Accuracy score: 0.864734
Confusion Matr

In [20]:
#In this cell, testing accuracy/cv scores for k = 50 features and Gradient Boosting
#Feature selection on combined dataset (Chi-2 Square Scoring)

combined_SelectKBest_chi2 = SelectKBest(score_func=chi2, k=50).fit_transform(combined_one_hot,y_train)
print(combined_SelectKBest_chi2.shape)

combined_SelectedFeatures= pd.DataFrame(combined_SelectKBest_chi2)
print(combined_SelectedFeatures)

#This step cuts down the size of our matrix by selecting only a designated amount (k) of the best features from our combined dataset

#Perform Train-Test split on the selected feature data

X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_SelectKBest_chi2, y_train, test_size = .20, random_state = 1)

#Build gradient boosting classifier on combined dataset (feature ranking)

gb =GradientBoostingClassifier()
gb_combined = gb.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(gb.score(X_Test, Y_Test)))
gb_predictions = gb.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, gb_predictions))
print("Classification Report")
print(classification_report(Y_Test, gb_predictions))

#run cross-validation - Gradient Boosting Model
gb_combined_cv_score = cross_val_score(gb_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(gb_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",gb_combined_cv_score.mean())
print('\n')

(2070, 50)
          0    1         2       3          4   ...   45   46   47   48   49
0        1.0  1.0  38000.00  229.64  24.393333  ...  1.0  0.0  1.0  1.0  0.0
1        6.0  2.0  29616.00   75.29  49.426667  ...  0.0  0.0  0.0  0.0  1.0
2        8.0  0.0  19732.80   47.25  50.673333  ...  0.0  0.0  1.0  0.0  1.0
3       11.0  2.0     96.33   59.01  56.473333  ...  1.0  0.0  1.0  0.0  1.0
4       14.0  2.0  52004.80   28.14  25.140000  ...  0.0  0.0  0.0  1.0  0.0
...      ...  ...       ...     ...        ...  ...  ...  ...  ...  ...  ...
2065  3821.0  0.0  78851.30   29.04  48.373333  ...  1.0  0.0  1.0  0.0  1.0
2066  3822.0  1.0  17540.70   36.20  62.786667  ...  1.0  1.0  0.0  0.0  1.0
2067  3823.0  0.0  83891.90   74.40  61.020000  ...  0.0  0.0  0.0  0.0  1.0
2068  3824.0  2.0  28220.80   38.95  38.766667  ...  0.0  0.0  1.0  0.0  1.0
2069  3825.0  0.0  28589.10  100.28  15.600000  ...  1.0  0.0  1.0  0.0  1.0

[2070 rows x 50 columns]
Accuracy score: 0.864734
Confusion Matr

In [21]:
#In this cell, testing accuracy/cv scores for k = 75 features and Gradient Boosting
#Feature selection on combined dataset (Chi-2 Square Scoring)

combined_SelectKBest_chi2 = SelectKBest(score_func=chi2, k=75).fit_transform(combined_one_hot,y_train)
print(combined_SelectKBest_chi2.shape)

combined_SelectedFeatures= pd.DataFrame(combined_SelectKBest_chi2)
print(combined_SelectedFeatures)

#This step cuts down the size of our matrix by selecting only a designated amount (k) of the best features from our combined dataset

#Perform Train-Test split on the selected feature data

X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_SelectKBest_chi2, y_train, test_size = .20, random_state = 1)

#Build gradient boosting classifier on combined dataset (feature ranking)

gb =GradientBoostingClassifier()
gb_combined = gb.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(gb.score(X_Test, Y_Test)))
gb_predictions = gb.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, gb_predictions))
print("Classification Report")
print(classification_report(Y_Test, gb_predictions))

#run cross-validation - Gradient Boosting Model
gb_combined_cv_score = cross_val_score(gb_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(gb_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",gb_combined_cv_score.mean())
print('\n')

(2070, 75)
          0    1         2       3          4   ...   70   71   72   73   74
0        1.0  1.0  38000.00  229.64  24.393333  ...  1.0  0.0  1.0  1.0  0.0
1        6.0  2.0  29616.00   75.29  49.426667  ...  0.0  0.0  0.0  0.0  1.0
2        8.0  0.0  19732.80   47.25  50.673333  ...  0.0  0.0  1.0  0.0  1.0
3       11.0  2.0     96.33   59.01  56.473333  ...  1.0  0.0  1.0  0.0  1.0
4       14.0  2.0  52004.80   28.14  25.140000  ...  0.0  0.0  0.0  1.0  0.0
...      ...  ...       ...     ...        ...  ...  ...  ...  ...  ...  ...
2065  3821.0  0.0  78851.30   29.04  48.373333  ...  1.0  0.0  1.0  0.0  1.0
2066  3822.0  1.0  17540.70   36.20  62.786667  ...  1.0  1.0  0.0  0.0  1.0
2067  3823.0  0.0  83891.90   74.40  61.020000  ...  0.0  0.0  0.0  0.0  1.0
2068  3824.0  2.0  28220.80   38.95  38.766667  ...  0.0  0.0  1.0  0.0  1.0
2069  3825.0  0.0  28589.10  100.28  15.600000  ...  1.0  0.0  1.0  0.0  1.0

[2070 rows x 75 columns]
Accuracy score: 0.859903
Confusion Matr

In [22]:
#In this cell, testing accuracy/cv scores for k = 150 features and Gradient Boosting
#Feature selection on combined dataset (Chi-2 Square Scoring)

combined_SelectKBest_chi2 = SelectKBest(score_func=chi2, k=150).fit_transform(combined_one_hot,y_train)
print(combined_SelectKBest_chi2.shape)

combined_SelectedFeatures= pd.DataFrame(combined_SelectKBest_chi2)
print(combined_SelectedFeatures)

#This step cuts down the size of our matrix by selecting only a designated amount (k) of the best features from our combined dataset

#Perform Train-Test split on the selected feature data

X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_SelectKBest_chi2, y_train, test_size = .20, random_state = 1)

#Build gradient boosting classifier on combined dataset (feature ranking)

gb =GradientBoostingClassifier()
gb_combined = gb.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(gb.score(X_Test, Y_Test)))
gb_predictions = gb.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, gb_predictions))
print("Classification Report")
print(classification_report(Y_Test, gb_predictions))

#run cross-validation - Gradient Boosting Model
gb_combined_cv_score = cross_val_score(gb_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(gb_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",gb_combined_cv_score.mean())
print('\n')

(2070, 150)
         0    1         2       3          4    ...  145  146  147  148  149
0        1.0  1.0  38000.00  229.64  24.393333  ...  1.0  0.0  1.0  1.0  0.0
1        6.0  2.0  29616.00   75.29  49.426667  ...  0.0  0.0  0.0  0.0  1.0
2        8.0  0.0  19732.80   47.25  50.673333  ...  0.0  0.0  1.0  0.0  1.0
3       11.0  2.0     96.33   59.01  56.473333  ...  1.0  0.0  1.0  0.0  1.0
4       14.0  2.0  52004.80   28.14  25.140000  ...  0.0  0.0  0.0  1.0  0.0
...      ...  ...       ...     ...        ...  ...  ...  ...  ...  ...  ...
2065  3821.0  0.0  78851.30   29.04  48.373333  ...  1.0  0.0  1.0  0.0  1.0
2066  3822.0  1.0  17540.70   36.20  62.786667  ...  1.0  1.0  0.0  0.0  1.0
2067  3823.0  0.0  83891.90   74.40  61.020000  ...  0.0  0.0  0.0  0.0  1.0
2068  3824.0  2.0  28220.80   38.95  38.766667  ...  0.0  0.0  1.0  0.0  1.0
2069  3825.0  0.0  28589.10  100.28  15.600000  ...  1.0  0.0  1.0  0.0  1.0

[2070 rows x 150 columns]
Accuracy score: 0.857488
Confusion Ma

Feature Selection - Wrapper Method (Testing different values for k with a Random Forest Classifier)

In [23]:
#Sequential Forward Search (k = 7 features)
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

clf = DecisionTreeClassifier()

sfs1 = SFS(clf, 
           k_features=7, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

feature_names = combined_one_hot.columns.values
sfs_dt = sfs1.fit(combined_one_hot, y_train, custom_feature_names = feature_names)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 378 out of 378 | elapsed:    1.8s finished

[2020-12-21 03:57:07] Features: 1/7 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 377 out of 377 | elapsed:    3.2s finished

[2020-12-21 03:57:11] Features: 2/7 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 376 out of 376 | elapsed:    3.0s finished

[2020-12-21 03:57:13] Features: 3/7 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0

In [24]:
#Sequential forward search result
print(sfs1.k_feature_names_)
print(sfs1.k_score_)

#Selecting features
features = list(sfs1.k_feature_names_)

#Train-Test Split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_one_hot[features].values, y_train, test_size = .20, random_state = 1)

#Build random forest classifier on combined dataset (feature ranking)

rf =RandomForestClassifier()
rf_combined = rf.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(rf.score(X_Test, Y_Test)))
rf_predictions = rf.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, rf_predictions))
print("Classification Report")
print(classification_report(Y_Test, rf_predictions))

#run cross-validation - Random Forest Model
rf_combined_cv_score = cross_val_score(rf_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(rf_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",rf_combined_cv_score.mean())
print('\n')

('ID', 'Children', 'Est_Income', 'Usage', 'Age', 'RatePlan', 'LongDistance')
1.0
Accuracy score: 0.867150
Confusion Matrix:
[[123  27]
 [ 28 236]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.81      0.82      0.82       150
     Current       0.90      0.89      0.90       264

    accuracy                           0.87       414
   macro avg       0.86      0.86      0.86       414
weighted avg       0.87      0.87      0.87       414

=== All Accuracy Scores ===
[0.78571429 0.42857143 0.78571429 0.64285714 0.75       0.77403846
 0.75961538 0.63461538 0.69711538 0.83653846 0.64903846 0.73557692
 0.75961538 0.75961538 0.67032967 0.67032967 0.89010989 0.85164835
 0.78021978 0.81868132]


=== Mean Accuracy Score ===
Mean Accuracy Score - ON Text:  0.7339972527472529




In [25]:
#Sequential Forward Search (k = 10 features)
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

clf = DecisionTreeClassifier()

sfs1 = SFS(clf, 
           k_features=10, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

feature_names = combined_one_hot.columns.values
sfs_dt = sfs1.fit(combined_one_hot, y_train, custom_feature_names = feature_names)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 378 out of 378 | elapsed:    1.8s finished

[2020-12-21 03:57:33] Features: 1/10 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 377 out of 377 | elapsed:    3.2s finished

[2020-12-21 03:57:36] Features: 2/10 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 376 out of 376 | elapsed:    3.0s finished

[2020-12-21 03:57:39] Features: 3/10 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:  

In [26]:
#Sequential forward search result
print(sfs1.k_feature_names_)
print(sfs1.k_score_)

#Selecting features
features = list(sfs1.k_feature_names_)

#Train-Test Split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_one_hot[features].values, y_train, test_size = .20, random_state = 1)

#Build random forest classifier on combined dataset (feature ranking)

rf =RandomForestClassifier()
rf_combined = rf.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(rf.score(X_Test, Y_Test)))
rf_predictions = rf.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, rf_predictions))
print("Classification Report")
print(classification_report(Y_Test, rf_predictions))

#run cross-validation - Random Forest Model
rf_combined_cv_score = cross_val_score(rf_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(rf_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",rf_combined_cv_score.mean())
print('\n')



('ID', 'Children', 'Est_Income', 'Usage', 'Age', 'RatePlan', 'LongDistance', 'International', 'Local', 'Dropped')
1.0
Accuracy score: 0.871981
Confusion Matrix:
[[125  25]
 [ 28 236]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.82      0.83      0.83       150
     Current       0.90      0.89      0.90       264

    accuracy                           0.87       414
   macro avg       0.86      0.86      0.86       414
weighted avg       0.87      0.87      0.87       414

=== All Accuracy Scores ===
[0.82142857 0.42857143 0.92857143 0.75       0.8125     0.83653846
 0.77403846 0.73557692 0.72115385 0.77403846 0.64903846 0.73557692
 0.88461538 0.79807692 0.70879121 0.78571429 0.89010989 0.88461538
 0.92857143 0.85164835]


=== Mean Accuracy Score ===
Mean Accuracy Score - ON Text:  0.7849587912087912




In [27]:
#Sequential Forward Search (k = 25 features)
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

clf = DecisionTreeClassifier()

sfs1 = SFS(clf, 
           k_features=25, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

feature_names = combined_one_hot.columns.values
sfs_dt = sfs1.fit(combined_one_hot, y_train, custom_feature_names = feature_names)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 378 out of 378 | elapsed:    1.8s finished

[2020-12-21 03:58:10] Features: 1/25 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 377 out of 377 | elapsed:    3.2s finished

[2020-12-21 03:58:14] Features: 2/25 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 376 out of 376 | elapsed:    2.9s finished

[2020-12-21 03:58:17] Features: 3/25 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:  

In [28]:
#Sequential forward search result
print(sfs1.k_feature_names_)
print(sfs1.k_score_)

#Selecting features
features = list(sfs1.k_feature_names_)

#Train-Test Split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_one_hot[features].values, y_train, test_size = .20, random_state = 1)

#Build random forest classifier on combined dataset (feature ranking)

rf =RandomForestClassifier()
rf_combined = rf.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(rf.score(X_Test, Y_Test)))
rf_predictions = rf.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, rf_predictions))
print("Classification Report")
print(classification_report(Y_Test, rf_predictions))

#run cross-validation - Random Forest Model
rf_combined_cv_score = cross_val_score(rf_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(rf_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",rf_combined_cv_score.mean())
print('\n')

('ID', 'Children', 'Est_Income', 'Usage', 'Age', 'RatePlan', 'LongDistance', 'International', 'Local', 'Dropped', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)
1.0
Accuracy score: 0.864734
Confusion Matrix:
[[122  28]
 [ 28 236]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.81      0.81      0.81       150
     Current       0.89      0.89      0.89       264

    accuracy                           0.86       414
   macro avg       0.85      0.85      0.85       414
weighted avg       0.86      0.86      0.86       414

=== All Accuracy Scores ===
[0.78571429 0.32142857 0.85714286 0.82142857 0.8125     0.71153846
 0.77403846 0.73557692 0.68269231 0.77403846 0.64903846 0.73557692
 0.88461538 0.73557692 0.70879121 0.70879121 0.92857143 0.96153846
 0.92857143 0.81868132]


=== Mean Accuracy Score ===
Mean Accuracy Score - ON Text:  0.7667925824175825




In [29]:
#Sequential Forward Search (k = 50 features)
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

clf = DecisionTreeClassifier()

sfs1 = SFS(clf, 
           k_features=50, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

feature_names = combined_one_hot.columns.values
sfs_dt = sfs1.fit(combined_one_hot, y_train, custom_feature_names = feature_names)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 378 out of 378 | elapsed:    1.8s finished

[2020-12-21 03:59:55] Features: 1/50 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 377 out of 377 | elapsed:    3.2s finished

[2020-12-21 03:59:58] Features: 2/50 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 376 out of 376 | elapsed:    2.9s finished

[2020-12-21 04:00:01] Features: 3/50 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:  

In [30]:
#Sequential forward search result
print(sfs1.k_feature_names_)
print(sfs1.k_score_)

#Selecting features
features = list(sfs1.k_feature_names_)

#Train-Test Split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_one_hot[features].values, y_train, test_size = .20, random_state = 1)

#Build random forest classifier on combined dataset (feature ranking)

rf =RandomForestClassifier()
rf_combined = rf.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(rf.score(X_Test, Y_Test)))
rf_predictions = rf.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, rf_predictions))
print("Classification Report")
print(classification_report(Y_Test, rf_predictions))

#run cross-validation - Random Forest Model
rf_combined_cv_score = cross_val_score(rf_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(rf_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",rf_combined_cv_score.mean())
print('\n')

('ID', 'Children', 'Est_Income', 'Usage', 'Age', 'RatePlan', 'LongDistance', 'International', 'Local', 'Dropped', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39)
1.0
Accuracy score: 0.867150
Confusion Matrix:
[[124  26]
 [ 29 235]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.81      0.83      0.82       150
     Current       0.90      0.89      0.90       264

    accuracy                           0.87       414
   macro avg       0.86      0.86      0.86       414
weighted avg       0.87      0.87      0.87       414

=== All Accuracy Scores ===
[0.89285714 0.39285714 0.85714286 0.67857143 0.8125     0.77403846
 0.77403846 0.67307692 0.75961538 0.67307692 0.71153846 0.73557692
 0.96153846 0.79807692 0.70879121 0.63736264 0.85714286 0.96153846
 0.85714286 0.81868132]


=== Mean Accuracy Score ===
Mean Accuracy Score - ON Text:  0.

Feature Selection - Wrapper Method (Testing different values for k with a Gradient Boosting Classifier)

In [31]:
#Sequential Forward Search (k = 7 features)
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

clf = DecisionTreeClassifier()

sfs1 = SFS(clf, 
           k_features=7, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

feature_names = combined_one_hot.columns.values
sfs_dt = sfs1.fit(combined_one_hot, y_train, custom_feature_names = feature_names)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 378 out of 378 | elapsed:    1.8s finished

[2020-12-21 04:03:45] Features: 1/7 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 377 out of 377 | elapsed:    3.2s finished

[2020-12-21 04:03:48] Features: 2/7 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 376 out of 376 | elapsed:    3.0s finished

[2020-12-21 04:03:51] Features: 3/7 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0

In [32]:
#Sequential forward search result
print(sfs1.k_feature_names_)
print(sfs1.k_score_)

#Selecting features
features = list(sfs1.k_feature_names_)

#Train-Test Split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_one_hot[features].values, y_train, test_size = .20, random_state = 1)

#Build random forest classifier on combined dataset (feature ranking)

#Build gradient boosting classifier on combined dataset (feature ranking)

gb =GradientBoostingClassifier()
gb_combined = gb.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(gb.score(X_Test, Y_Test)))
gb_predictions = gb.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, gb_predictions))
print("Classification Report")
print(classification_report(Y_Test, gb_predictions))

#run cross-validation - Gradient Boosting Model
gb_combined_cv_score = cross_val_score(gb_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(gb_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",gb_combined_cv_score.mean())
print('\n')

('ID', 'Children', 'Est_Income', 'Usage', 'Age', 'RatePlan', 'LongDistance')
1.0
Accuracy score: 0.818841
Confusion Matrix:
[[113  37]
 [ 38 226]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.75      0.75      0.75       150
     Current       0.86      0.86      0.86       264

    accuracy                           0.82       414
   macro avg       0.80      0.80      0.80       414
weighted avg       0.82      0.82      0.82       414

=== All Accuracy Scores ===
[0.67857143 0.42857143 0.82142857 0.57142857 0.83653846 0.77403846
 0.73557692 0.65865385 0.69711538 0.73557692 0.58653846 0.73557692
 0.82211538 0.73557692 0.7032967  0.67032967 0.78571429 0.77472527
 0.70879121 0.78021978]


=== Mean Accuracy Score ===
Mean Accuracy Score - ON Text:  0.7120192307692308




In [33]:
#Sequential Forward Search (k = 10 features)
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

clf = DecisionTreeClassifier()

sfs1 = SFS(clf, 
           k_features=10, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

feature_names = combined_one_hot.columns.values
sfs_dt = sfs1.fit(combined_one_hot, y_train, custom_feature_names = feature_names)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 378 out of 378 | elapsed:    1.8s finished

[2020-12-21 04:04:09] Features: 1/10 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 377 out of 377 | elapsed:    3.2s finished

[2020-12-21 04:04:12] Features: 2/10 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 376 out of 376 | elapsed:    3.0s finished

[2020-12-21 04:04:15] Features: 3/10 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:  

In [34]:
#Sequential forward search result
print(sfs1.k_feature_names_)
print(sfs1.k_score_)

#Selecting features
features = list(sfs1.k_feature_names_)

#Train-Test Split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_one_hot[features].values, y_train, test_size = .20, random_state = 1)

#Build random forest classifier on combined dataset (feature ranking)

#Build gradient boosting classifier on combined dataset (feature ranking)

gb =GradientBoostingClassifier()
gb_combined = gb.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(gb.score(X_Test, Y_Test)))
gb_predictions = gb.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, gb_predictions))
print("Classification Report")
print(classification_report(Y_Test, gb_predictions))

#run cross-validation - Gradient Boosting Model
gb_combined_cv_score = cross_val_score(gb_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(gb_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",gb_combined_cv_score.mean())
print('\n')

('ID', 'Children', 'Est_Income', 'Usage', 'Age', 'RatePlan', 'LongDistance', 'International', 'Local', 'Dropped')
1.0
Accuracy score: 0.826087
Confusion Matrix:
[[112  38]
 [ 34 230]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.77      0.75      0.76       150
     Current       0.86      0.87      0.86       264

    accuracy                           0.83       414
   macro avg       0.81      0.81      0.81       414
weighted avg       0.83      0.83      0.83       414

=== All Accuracy Scores ===
[0.67857143 0.60714286 0.82142857 0.64285714 0.8125     0.875
 0.73557692 0.67307692 0.65865385 0.77403846 0.64903846 0.67307692
 0.78365385 0.73557692 0.74725275 0.74725275 0.85714286 0.85164835
 0.81868132 0.74175824]


=== Mean Accuracy Score ===
Mean Accuracy Score - ON Text:  0.7441964285714286




In [35]:
#Sequential Forward Search (k = 25 features)
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

clf = DecisionTreeClassifier()

sfs1 = SFS(clf, 
           k_features=25, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

feature_names = combined_one_hot.columns.values
sfs_dt = sfs1.fit(combined_one_hot, y_train, custom_feature_names = feature_names)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 378 out of 378 | elapsed:    1.8s finished

[2020-12-21 04:04:46] Features: 1/25 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 377 out of 377 | elapsed:    3.2s finished

[2020-12-21 04:04:49] Features: 2/25 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 376 out of 376 | elapsed:    3.0s finished

[2020-12-21 04:04:52] Features: 3/25 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:  

In [36]:
#Sequential forward search result
print(sfs1.k_feature_names_)
print(sfs1.k_score_)

#Selecting features
features = list(sfs1.k_feature_names_)

#Train-Test Split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_one_hot[features].values, y_train, test_size = .20, random_state = 1)

#Build random forest classifier on combined dataset (feature ranking)

#Build gradient boosting classifier on combined dataset (feature ranking)

gb =GradientBoostingClassifier()
gb_combined = gb.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(gb.score(X_Test, Y_Test)))
gb_predictions = gb.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, gb_predictions))
print("Classification Report")
print(classification_report(Y_Test, gb_predictions))

#run cross-validation - Gradient Boosting Model
gb_combined_cv_score = cross_val_score(gb_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(gb_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",gb_combined_cv_score.mean())
print('\n')

('ID', 'Children', 'Est_Income', 'Usage', 'Age', 'RatePlan', 'LongDistance', 'International', 'Local', 'Dropped', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)
1.0
Accuracy score: 0.833333
Confusion Matrix:
[[113  37]
 [ 32 232]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.78      0.75      0.77       150
     Current       0.86      0.88      0.87       264

    accuracy                           0.83       414
   macro avg       0.82      0.82      0.82       414
weighted avg       0.83      0.83      0.83       414

=== All Accuracy Scores ===
[0.60714286 0.57142857 0.82142857 0.71428571 0.875      0.83653846
 0.77403846 0.53365385 0.69711538 0.77403846 0.71153846 0.67307692
 0.75961538 0.73557692 0.74725275 0.63736264 0.71428571 0.85164835
 0.81868132 0.78021978]


=== Mean Accuracy Score ===
Mean Accuracy Score - ON Text:  0.7316964285714287




In [37]:
#Sequential Forward Search (k = 50 features)
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

clf = DecisionTreeClassifier()

sfs1 = SFS(clf, 
           k_features=50, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

feature_names = combined_one_hot.columns.values
sfs_dt = sfs1.fit(combined_one_hot, y_train, custom_feature_names = feature_names)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 378 out of 378 | elapsed:    1.9s finished

[2020-12-21 04:06:31] Features: 1/50 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 377 out of 377 | elapsed:    3.2s finished

[2020-12-21 04:06:34] Features: 2/50 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 376 out of 376 | elapsed:    3.0s finished

[2020-12-21 04:06:37] Features: 3/50 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:  

In [38]:
#Sequential forward search result
print(sfs1.k_feature_names_)
print(sfs1.k_score_)

#Selecting features
features = list(sfs1.k_feature_names_)

#Train-Test Split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(combined_one_hot[features].values, y_train, test_size = .20, random_state = 1)

#Build random forest classifier on combined dataset (feature ranking)

#Build gradient boosting classifier on combined dataset (feature ranking)

gb =GradientBoostingClassifier()
gb_combined = gb.fit(X_Train, Y_Train)
print("Accuracy score: {0:.6f}".format(gb.score(X_Test, Y_Test)))
gb_predictions = gb.predict(X_Test)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, gb_predictions))
print("Classification Report")
print(classification_report(Y_Test, gb_predictions))

#run cross-validation - Gradient Boosting Model
gb_combined_cv_score = cross_val_score(gb_combined, X_Test, Y_Test, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(gb_combined_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",gb_combined_cv_score.mean())
print('\n')

('ID', 'Children', 'Est_Income', 'Usage', 'Age', 'RatePlan', 'LongDistance', 'International', 'Local', 'Dropped', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39)
1.0
Accuracy score: 0.830918
Confusion Matrix:
[[110  40]
 [ 30 234]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.79      0.73      0.76       150
     Current       0.85      0.89      0.87       264

    accuracy                           0.83       414
   macro avg       0.82      0.81      0.81       414
weighted avg       0.83      0.83      0.83       414

=== All Accuracy Scores ===
[0.46428571 0.5        0.82142857 0.67857143 0.875      0.875
 0.73557692 0.57211538 0.72115385 0.77403846 0.6875     0.71153846
 0.82211538 0.73557692 0.70879121 0.5989011  0.67582418 0.81318681
 0.81868132 0.78021978]


=== Mean Accuracy Score ===
Mean Accuracy Score - ON Text:  0.71847