In [None]:
from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive


Mounted at /gdrive
/gdrive


In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel


#NLTK-------------------------------
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

# Import libraries for feature 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
#Read files
textfile = r'/gdrive/My Drive/Comments.csv'
textData = pd.read_csv(textfile) #creates a dataframe

CustInfofile = r'/gdrive/My Drive/Customers.csv'
CustInfoData = pd.read_csv(CustInfofile)  #creates a dataframe

print(textData.shape)
print(CustInfoData.shape)


(2070, 2)
(2070, 17)


In [None]:
#Extract target column from Customer Info file
y_train = CustInfoData["TARGET"]
X_train = CustInfoData.drop(columns=["TARGET"]) #extracting training data without the target column
                     

print(y_train.shape)

(2070,)


In [None]:
#Tokenize - Split the sentences to lists of words
textData['CommentsTokenized'] = textData['Comments'].apply(word_tokenize)

export_csv = textData.to_csv(r'/gdrive/My Drive/TextDataTokenized1.csv')




In [None]:
# Use English stemmer.
stemmer = LancasterStemmer()

#Now do stemming - create a new dataframe to store stemmed version
newTextData=pd.DataFrame()
newTextData=textData.drop(columns=["CommentsTokenized","Comments"])
newTextData['CommentsTokenizedStemmed2'] = textData['CommentsTokenized'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.

export_csv = newTextData.to_csv(r'/gdrive/My Drive/newTextDataLS.csv')

In [None]:
# Use English stemmer.
stemmer = PorterStemmer()

#Now do stemming - create a new dataframe to store stemmed version
newTextData=pd.DataFrame()
newTextData=textData.drop(columns=["CommentsTokenized","Comments"])
newTextData['CommentsTokenizedStemmed1'] = textData['CommentsTokenized'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.

export_csv = newTextData.to_csv(r'/gdrive/My Drive/newTextDataPS.csv')

In [None]:
# Use English stemmer.
stemmer = SnowballStemmer("english")

#Now do stemming - create a new dataframe to store stemmed version
newTextData=pd.DataFrame()
newTextData=textData.drop(columns=["CommentsTokenized","Comments"])
newTextData['CommentsTokenizedStemmed'] = textData['CommentsTokenized'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.

export_csv = newTextData.to_csv(r'/gdrive/My Drive/newTextDataSS.csv')


In [None]:

#Join stemmed strings
newTextData['CommentsTokenizedStemmed'] = newTextData['CommentsTokenizedStemmed'].apply(lambda x: " ".join(x))

export_csv = newTextData.to_csv(r'/gdrive/My Drive/newTextData-Joined.csv')

In [None]:
#Do Bag-Of-Words model - Term - Document Matrix
#Learn the vocabulary dictionary and return term-document matrix.
#count_vect = CountVectorizer(stop_words=None)
count_vect = CountVectorizer(stop_words='english',lowercase=False)
TD_counts = count_vect.fit_transform(newTextData.CommentsTokenizedStemmed)
print(TD_counts.shape)
print(TD_counts.dtype)
print(count_vect.get_feature_names())
#print(TD_counts)
DF_TD_Counts=pd.DataFrame(TD_counts.toarray())
print(DF_TD_Counts)
export_csv = DF_TD_Counts.to_csv(r'/gdrive/My Drive/TD_counts-TokenizedStemmed.csv')


(2070, 354)
int64
['3399', '3g', 'abysm', 'access', 'accessori', 'adapt', 'add', 'addit', 'additon', 'address', 'adit', 'adress', 'advertis', 'afraid', 'alway', 'angel', 'angri', 'ani', 'anoth', 'anyth', 'anytim', 'area', 'asap', 'ask', 'bad', 'basic', 'bateri', 'batteri', 'becaus', 'believ', 'better', 'bigger', 'book', 'bought', 'brain', 'bring', 'built', 'busi', 'button', 'buy', 'cancel', 'cancer', 'car', 'care', 'carrier', 'caus', 'cc', 'cell', 'certain', 'chang', 'charg', 'charger', 'check', 'chip', 'citi', 'claim', 'cleariti', 'cold', 'comapr', 'compani', 'compar', 'competit', 'complain', 'complaint', 'concept', 'connect', 'consisit', 'consist', 'constan', 'contact', 'continu', 'contract', 'correct', 'cost', 'coupl', 'cover', 'coverag', 'creat', 'credit', 'cstmer', 'cstmr', 'current', 'cust', 'custom', 'customr', 'date', 'day', 'dead', 'decent', 'defect', 'deo', 'did', 'die', 'differ', 'difficult', 'digiti', 'direct', 'disabl', 'doe', 'don', 'dont', 'drop', 'dure', 'easier', 'effe

In [None]:
#Compute TF-IDF Matrix
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(TD_counts)
print(X_train_tfidf.shape)
DF_TF_IDF=pd.DataFrame(X_train_tfidf.toarray())
print(DF_TF_IDF)
export_csv= DF_TF_IDF.to_csv(r'/gdrive/My Drive/TFIDF_counts-TokenizedStemmed.csv')


(2070, 354)
      0    1    2    3        4    5    6    7    8         9    ...  344  \
0     0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
1     0.0  0.0  0.0  0.0  0.27568  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
2     0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
3     0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
4     0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
...   ...  ...  ...  ...      ...  ...  ...  ...  ...       ...  ...  ...   
2065  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
2066  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
2067  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
2068  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.772949  ...  0.0   
2069  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   

      345  346       347  348  349  350  351  352  353  
0     

In [None]:
#merge files
DF_TF_IDF['ID'] = textData['ID']
combined = pd.merge(X_train, DF_TF_IDF, on ='ID')
print(combined)


export_csv= combined.to_csv(r'/gdrive/My Drive/Combined2-Cust+TFIDF+SelectedFeatures.csv')

        ID Sex Status  Children  Est_Income Car_Owner   Usage        Age  \
0        1   F      S         1    38000.00         N  229.64  24.393333   
1        6   M      M         2    29616.00         N   75.29  49.426667   
2        8   M      M         0    19732.80         N   47.25  50.673333   
3       11   M      S         2       96.33         N   59.01  56.473333   
4       14   F      M         2    52004.80         N   28.14  25.140000   
...    ...  ..    ...       ...         ...       ...     ...        ...   
2065  3821   F      S         0    78851.30         N   29.04  48.373333   
2066  3822   F      S         1    17540.70         Y   36.20  62.786667   
2067  3823   F      M         0    83891.90         Y   74.40  61.020000   
2068  3824   F      M         2    28220.80         N   38.95  38.766667   
2069  3825   F      S         0    28589.10         N  100.28  15.600000   

      RatePlan  LongDistance  ...  344  345  346       347  348  349  350  \
0         

In [None]:
#Do one Hot encoding for categorical features
X_cat = ["Sex","Status","Car_Owner","Paymethod","LocalBilltype","LongDistanceBilltype"]
#X_cat = combined.select_dtypes(exclude=['int','float64'])
print(X_cat)
combined_one_hot = pd.get_dummies(combined,columns=X_cat)
print(combined_one_hot.shape)
export_csv= combined_one_hot.to_csv(r'/gdrive/My Drive/combined_one_hot.csv')



['Sex', 'Status', 'Car_Owner', 'Paymethod', 'LocalBilltype', 'LongDistanceBilltype']
(2070, 378)


In [None]:
#Feature selection
#Suppose, we select 50 features with top 50 Fisher scores
selector = SelectKBest(k=50)
#selector = SelectKBest(score_func=chi2, k=25)

#new_DF_TF_IDF = SelectKBest(score_func=chi2, k=50).fit_transform(DF_TF_IDF,y_train)
new_DF_TF_IDF = selector.fit_transform(combined_one_hot,y_train)
print(new_DF_TF_IDF.shape)

feature_names_out = selector.get_support(indices=True)
print(feature_names_out)

DF_TF_IDF_SelectedFeatures= pd.DataFrame(new_DF_TF_IDF)
print(DF_TF_IDF_SelectedFeatures)

export_csv= DF_TF_IDF_SelectedFeatures.to_csv(r'/gdrive/My Drive/TFIDF_counts-Selected Features.csv')


(2070, 50)
[  1   2   3   6   7  14  17  32  37  38  40  49  50  84  97 107 123 128
 129 135 151 164 174 178 184 193 208 209 230 234 244 255 266 277 292 298
 300 304 306 307 320 325 338 345 357 364 365 367 368 371]
       0         1       2      3     4    5    6         7         8    9   \
0     1.0  38000.00  229.64  23.56  0.00  0.0  0.0  0.000000  0.000000  0.0   
1     2.0  29616.00   75.29  29.78  0.00  0.0  0.0  0.000000  0.000000  0.0   
2     0.0  19732.80   47.25  24.81  0.00  0.0  0.0  0.000000  0.000000  0.0   
3     2.0     96.33   59.01  26.13  0.00  0.0  0.0  0.000000  0.000000  0.0   
4     2.0  52004.80   28.14   5.03  0.00  0.0  0.0  0.000000  0.000000  0.0   
...   ...       ...     ...    ...   ...  ...  ...       ...       ...  ...   
2065  0.0  78851.30   29.04   0.37  0.00  0.0  0.0  0.466708  0.443664  0.0   
2066  1.0  17540.70   36.20  22.17  0.57  0.0  0.0  0.466708  0.443664  0.0   
2067  0.0  83891.90   74.40  28.92  0.00  0.0  0.0  0.466708  0.443664  0.

In [None]:
from sklearn.model_selection import train_test_split
clf=RandomForestClassifier()
DF_TF_IDF_SelectedFeatures, X_test, y_train, y_test = train_test_split(DF_TF_IDF_SelectedFeatures, y_train, test_size=0.2, random_state=1)
RF_Comb = clf.fit(DF_TF_IDF_SelectedFeatures,y_train)
print("Training Accuracy score (training): {0:.6f}".format(clf.score(DF_TF_IDF_SelectedFeatures, y_train)))
rf_predictions = clf.predict(X_test)
print("Test Accuracy:", metrics.accuracy_score(y_test,rf_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))
print("Classification Report")
print(classification_report(y_test, rf_predictions))

Training Accuracy score (training): 0.930556
Test Accuracy: 0.8743961352657005
Confusion Matrix:
[[130  20]
 [ 32 232]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.80      0.87      0.83       150
     Current       0.92      0.88      0.90       264

    accuracy                           0.87       414
   macro avg       0.86      0.87      0.87       414
weighted avg       0.88      0.87      0.88       414



In [None]:
#run cross-validation - COMBINED Data
rf_Comb_cv_score = cross_val_score(RF_Comb, DF_TF_IDF_SelectedFeatures, y_train, cv=20, scoring="balanced_accuracy")
print("=== All Accuracy Scores ===")
print(rf_Comb_cv_score)
print('\n')
print("=== Mean Accuracy Score ===")
print("Mean Accuracy Score - ON Text: ",rf_Comb_cv_score.mean())
print('\n')

=== All Accuracy Scores ===
[0.88939394 0.80378788 0.77348485 0.90550595 0.73958333 0.8422619
 0.79613095 0.79613095 0.74032738 0.90550595 0.7485119  0.70833333
 0.82738095 0.69196429 0.8735119  0.78794643 0.8578869  0.75669643
 0.82738095 0.8       ]


=== Mean Accuracy Score ===
Mean Accuracy Score - ON Text:  0.8035863095238096




In [None]:
#Construct a Random Forest Classifier WITHOUT text data
print(CustInfoData.shape)
X_train1=combined_one_hot.iloc[:,1:10]
#X_train2=combined_one_hot.iloc[:,60:]
X_train2=combined_one_hot.iloc[:,35:]
print(X_train1.shape)
print(X_train1.head())
print(X_train2.shape)
print(X_train2.head())
combined1=pd.concat([X_train1, X_train2], axis=1)
print(combined1.shape)
print(combined1.head())
export_csv= combined1.to_csv(r'/gdrive/My Drive/combined1.csv')

(2070, 17)
(2070, 9)
   Children  Est_Income   Usage        Age  RatePlan  LongDistance  \
0         1    38000.00  229.64  24.393333         3         23.56   
1         2    29616.00   75.29  49.426667         2         29.78   
2         0    19732.80   47.25  50.673333         3         24.81   
3         2       96.33   59.01  56.473333         1         26.13   
4         2    52004.80   28.14  25.140000         1          5.03   

   International   Local  Dropped  
0            0.0  206.08        0  
1            0.0   45.50        0  
2            0.0   22.44        0  
3            0.0   32.88        1  
4            0.0   23.11        0  
(2070, 343)
    25   26   27   28   29   30   31   32   33   34  ...  Status_S  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...         1   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...         0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...         0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.

In [None]:
#Customer Info One-Hot Encoded
DF_Combined1= pd.DataFrame(combined1)
export_csv= DF_Combined1.to_csv(r'/gdrive/My Drive/CustInfo_Onehot_encoded.csv')

In [None]:
#Do feature selection using a classification model
#clf = ExtraTreesClassifier(n_estimators=50)
#clf = GradientBoostingClassifier(n_estimators=50)
clf = DecisionTreeClassifier()
clf = clf.fit(combined_one_hot,y_train)
print(clf.feature_importances_)
#model = SelectFromModel(clf, prefit=True)
model = SelectFromModel(clf, prefit=True, max_features=7, threshold=-np.inf)
#model = SelectFromModel(clf, prefit=True)
X_new= model.transform(combined_one_hot)
X_new_SelectedFeatures= pd.DataFrame(X_new)
export_csv= X_new_SelectedFeatures.to_csv(r'/gdrive/My Drive/X_new_SelectedFeatures.csv')

#print(model.get_support())
print(X_new_SelectedFeatures)



[0.15644251 0.11368969 0.06562802 0.01254953 0.0603501  0.05578326
 0.11246673 0.00221866 0.00547783 0.00747667 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.00187125 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.00241532 0.         0.         0.         0.
 0.         0.         0.00070396 0.         0.         0.00187474
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.00135578 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.    

In [None]:

clf=RandomForestClassifier()
X_new_SelectedFeatures, X_test, y_train, y_test = train_test_split(X_new_SelectedFeatures, y_train, test_size=0.2, random_state=1)
RF_Comb = clf.fit(X_new_SelectedFeatures,y_train)
print(" Training Accuracy score (training): {0:.6f}".format(clf.score(X_new_SelectedFeatures, y_train)))
rf_predictions = clf.predict(X_test)
print("Test Accuracy:", metrics.accuracy_score(y_test,rf_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))
print("Classification Report")
print(classification_report(y_test, rf_predictions))

 Training Accuracy score (training): 1.000000
Test Accuracy: 0.8840579710144928
Confusion Matrix:
[[130  20]
 [ 28 236]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.82      0.87      0.84       150
     Current       0.92      0.89      0.91       264

    accuracy                           0.88       414
   macro avg       0.87      0.88      0.88       414
weighted avg       0.89      0.88      0.88       414

