In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import string
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils import shuffle

stemmer = SnowballStemmer("english")
wn_tags = {'N':wn.NOUN, 'J':wn.ADJ, 'V':wn.VERB, 'R':wn.ADV}
wnl = WordNetLemmatizer()

punctuation = list(string.punctuation) + ['..', '...']
pronouns = ['i', 'he', 'she', 'it', 'him', 'they', 'we', 'us', 'them']
extensions = ['tr', 'ion', 'tl']
stop = stopwords.words('english') + punctuation + pronouns + extensions

In [2]:
df = pd.read_excel('GMC_Complaints.xlsx', usecols = [1,2,3,4,5,6,7])

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2734 entries, 0 to 2733
Data columns (total 7 columns):
Year           2734 non-null int64
make           2734 non-null object
model          2734 non-null object
description    2734 non-null object
crashed        2734 non-null object
abs            2716 non-null object
mileage        2315 non-null float64
dtypes: float64(1), int64(1), object(5)
memory usage: 149.6+ KB


In [4]:
df.head()

Unnamed: 0,Year,make,model,description,crashed,abs,mileage
0,2003,SATURN,ION,WHILE TRAVELING ON THE HIGHWAY AND WITHOUT PRI...,N,N,
1,2003,SATURN,ION,WHILE DRIVING TRANSMISSION DOES NOT ENGAGE PRO...,N,N,
2,2003,SATURN,ION,"IN A PANIC SITUATION, THE OWNER WAS UNABLE TO ...",N,N,500.0
3,2003,SATURN,ION,THE TWO SATURN 2003 IONS I HAVE DRIVEN (INCLU...,N,Y,10600.0
4,2003,SATURN,ION,I BOUGHT A ION QUAD COUPE IN JULY OF THIS YEAR...,N,Y,6365.0


In [5]:
for col in ['Year', 'make', 'model', 'abs']:
    print(col, set(df[col]))

Year {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011}
make {'PONTIAC', 'SATURN', 'CHEVROLET'}
model {'SOLSTICE', 'COBALT', 'G5', 'SKY', 'ION', 'HHR'}
abs {nan, 'Y', 'N'}


In [6]:
def prep(df):
    df.crashed = df.crashed.map({'N' : 0, 'Y' : 1})
    df.mileage.fillna(df.mileage.mean(), inplace=True)
    
    for col in ['Year', 'make', 'model', 'abs']:
        df[col] = df[col].astype('category')
    
    X = pd.get_dummies(df.drop(['crashed', 'description'], 1))
    
    y = df.crashed
    
    return X, y

In [7]:
X, y = prep(df)

print(X.shape, y.shape)

(2734, 21) (2734,)


In [14]:
df.description[0]



In [48]:
df.description[123]

'WHEN THE TEMPERATURE IS 32 DEGREES OR LESS AND THE ENGINE IS NOT RUNNING I ATTEMPT TO START MY CAR AND IT WILL NOT START. THE IGNITION SWITCH CLICKS AND THEN THE CAR STALLS. THE SECURITY SYSTEM KICKS IN AND THE SECURITY LIGHT COMES ON AND THEN THE ODOMETER SCREEN READS "SERVICE VEHICLE SOON". THE CAR WILL NOT START FOR ABOUT 8 TO 10 MINUTES. AFTER 8 TO 10 MINUTES THE LIGHT STOPS BLINKING AND THE SERVICE VEHICLE GOES OFF AND THEN THE CAR STARTS. THIS HAS HAPPENED SEVERAL TIMES SINCE THEN AND IT OCCURS ALL DIFFERENT TIMES OF THE DAYS. AS I READ ON THE INTERNET THERE HAVE BEEN SEVERAL PEOPLE WITH THE SAME PROBLEMS. THIS IS GOING TO COST ME APPROXIMATELY $200.00 TO FIX.   *TR'

In [49]:
df.description[265]

'TL*THE CONTACT OWNS A 2005 CHEVROLET COBALT.  WHILE DRIVING 70 MPH ON THE FREEWAY, THE "REDUCE ENGINE SPEED" INDICATOR ILLUMINATED ON THE INSTRUMENT PANEL.  THE VEHICLE SHIFTED INTO LOW GEAR AND THE CONTACT COULD NOT ACCELERATE BEYOND 40 MPH, WHICH SHE CONSIDERED VERY DANGEROUS ON A HIGHWAY ALLOWING 70 MPH SPEEDS.  THE VEHICLE BEGAN OPERATING NORMALLY.  PRIOR TO THE FAILURE, SHE NOTICED THAT THE VEHICLE BEGAN STALLING WHEN SHE WOULD SLOW DOWN AT TRAFFIC LIGHTS OR STOP SIGNS.  OCCASIONALLY, THE CONTACT WOULD HAVE TO SHIFT THE VEHICLE INTO NEUTRAL SO THAT IT COULD PERFORM CORRECTLY.  ON THREE OCCASIONS, A MECHANIC STATED THAT MULTIPLE CODES WERE PRESENT AFTER DIAGNOSING THE VEHICLE.  THE MANUFACTURER STATED THAT THERE WAS NO RELEVANT RECALL TO-DATE.  THE CURRENT MILEAGE WAS APPROXIMATELY 82,000 AND FAILURE MILEAGE WAS APPROXIMATELY 80,000.   UPDATED 4/23/09 *CN  UPDATED 04/27/09*JB'

# Description Text Preprocessing

In [15]:
def tokenizer(s):
    return re.findall(r'[a-z]+', s)

def analyzer(s):
    s = s.lower()
    
    tokens = tokenizer(s)
    
    #Remove Stop Words
    tokens = [word for word in tokens if word not in stop]
    
    #Lemmatization & Stemming - Stemming with WordNet POS
    tagged_words = pos_tag(tokens, lang='eng')
    
    stemmed_tokens = []
    for tagged_word in tagged_words:
        term = tagged_word[0]
        pos = tagged_word[1]
        pos = pos[0]
        try:
            pos = wn_tags[pos]
            stemmed_tokens.append(wnl.lemmatize(term, pos=pos))
        except:
            stemmed_tokens.append(stemmer.stem(term))
    
    return list(set(stemmed_tokens))

# LDA

In [16]:
# Setup simple constants
n_docs     = len(df)
n_samples  = n_docs
m_features = 100
s_words    = 'english'
ngram = (1,2)

n_topics        = 8
max_iter        =  5
learning_offset = 20.
learning_method = 'online'

description = df.description.tolist()

In [17]:
tfidf_vect = TfidfVectorizer(max_df=0.95, 
                             min_df=2, 
                             max_features=m_features,
                             analyzer=analyzer, 
                             ngram_range=ngram)

tf_idf = tfidf_vect.fit_transform(description)
print("\nTF_IDF Vectorizer Parameters\n", tfidf_vect, "\n")

lda = LatentDirichletAllocation(n_components=n_topics, 
                                max_iter=max_iter,
                                learning_method=learning_method, 
                                learning_offset=learning_offset, 
                                random_state=12345)

prob_matrix = lda.fit_transform(tf_idf)

print('{:.<22s}{:>6d}'.format("Number of Reviews", tf_idf.shape[0]))
print('{:.<22s}{:>6d}'.format("Number of Terms",     tf_idf.shape[1]))
print("\nTopics Identified using LDA with TF_IDF")
tf_features = tfidf_vect.get_feature_names()
max_words = 15
for topic_idx, topic in enumerate(lda.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([tf_features[i]
                             for i in topic.argsort()[:-max_words - 1:-1]])
        print(message)
        print()

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



TF_IDF Vectorizer Parameters
 TfidfVectorizer(analyzer=<function analyzer at 0x000001280DBD49D8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=0.95,
        max_features=100, min_df=2, ngram_range=(1, 2), norm='l2',
        preprocessor=None, smooth_idf=True, stop_words=None,
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None) 

Number of Reviews.....  2734
Number of Terms.......   100

Topics Identified using LDA with TF_IDF
Topic #0: saturn problem cause light turn steer two power lock car vehicle wheel make also drive

Topic #1: accident car gm recall fuel problem could one part find still time get say issue

Topic #2: ignition key switch start car saturn turn problem lock replace recall would engine get time

Topic #3: front hit side driver brake deploy right leave mph wheel car crash stop air b

In [18]:
X = pd.concat([X, pd.get_dummies(pd.Series(np.argmax(prob_matrix, axis = 1)))], 1)
X['cluster_prob'] = np.max(prob_matrix, axis=1)

In [21]:
pd.Series(np.argmax(prob_matrix, axis=1)).value_counts()

6    829
4    560
5    485
1    328
2    267
3    211
7     36
0     18
dtype: int64

# Train Test Split

In [30]:
X, y = shuffle(X, y, random_state = 12345)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 12345)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1913, 30) (821, 30) (1913,) (821,)


# Logistic Model

In [39]:
params = {'C' : np.logspace(-3, 5, 15)}

log_model = LogisticRegression()

log_cv = GridSearchCV(log_model, param_grid=params, scoring='accuracy', cv = 10)

log_cv.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.00000e-03, 3.72759e-03, 1.38950e-02, 5.17947e-02, 1.93070e-01,
       7.19686e-01, 2.68270e+00, 1.00000e+01, 3.72759e+01, 1.38950e+02,
       5.17947e+02, 1.93070e+03, 7.19686e+03, 2.68270e+04, 1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [40]:
model = LogisticRegression(C=0.0001)
model.fit(X_train, y_train)
print(classification_report(model.predict(X_test), y_test))

             precision    recall  f1-score   support

          0       1.00      0.70      0.82       821
          1       0.00      0.00      0.00         0

avg / total       1.00      0.70      0.82       821



  'recall', 'true', average, warn_for)


In [24]:
log_cv.cv_results_

{'mean_fit_time': array([0.00391254, 0.00221255, 0.00230894, 0.00232465, 0.00241823,
        0.00242822, 0.00201695, 0.00202203, 0.00221887, 0.00242474,
        0.00233266, 0.00222447, 0.00241992, 0.00202694, 0.00212975]),
 'mean_score_time': array([0.0003978 , 0.00029941, 0.00050287, 0.00019789, 0.00019875,
        0.00029576, 0.00049219, 0.00049431, 0.00020204, 0.00039594,
        0.00019634, 0.00019417, 0.00029273, 0.00048862, 0.00029554]),
 'mean_test_score': array([0.71981181, 0.71981181, 0.71981181, 0.71981181, 0.71981181,
        0.71981181, 0.71981181, 0.71981181, 0.71981181, 0.71981181,
        0.71981181, 0.71981181, 0.71981181, 0.71981181, 0.71981181]),
 'mean_train_score': array([0.71981188, 0.71981188, 0.71981188, 0.71981188, 0.71981188,
        0.71981188, 0.71981188, 0.71981188, 0.71981188, 0.71981188,
        0.71981188, 0.71981188, 0.71981188, 0.71981188, 0.71981188]),
 'param_C': masked_array(data=[0.001, 0.003727593720314938, 0.013894954943731374,
                   

In [119]:
pred = log_cv.predict(X_test)

print(classification_report(y_test, pred))

             precision    recall  f1-score   support

          0       0.76      1.00      0.86       623
          1       0.00      0.00      0.00       198

avg / total       0.58      0.76      0.65       821



  'precision', 'predicted', average, warn_for)
