In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler


In [2]:
df = pd.read_csv('../collection and cleaning/cleaned files/combined.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
df.head()

Unnamed: 0,text,is_craft
0,badly damaged exterior wall studs is this out ...,0
1,need help removing a screw from the back of a ...,0
2,add second post parallel to existing post i ha...,0
3,rtyuiooopfghjkl,0
4,diy - recycled vinyl record clock,0


In [3]:
X = df['text']
y = df['is_craft']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify = y,
                                                    test_size = 0.15,
                                                    random_state = 42
                                                   )

## Baseline  
____________________

In [6]:
df['is_craft'].value_counts(normalize=True)

1    0.538166
0    0.461834
Name: is_craft, dtype: float64

It appears that so long as the model classifies over 54% correctly, the model will be performing better than the baseline.

# Preprocessing
______________________________________

## Gridsearch for CountVectorizer and TFiDFVectorizer
_____________________________________________

The gridsearches have been modified by the "guess and check" method to be optomized. What is left is the final conclusions of the process. 

In [4]:
pipe_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('logr', LogisticRegression())
])

pipe_params_cvec = {
    'cvec__stop_words': ['english'],
    'cvec__strip_accents': [None],
    'cvec__ngram_range': [(1,2)],
    'cvec__max_df': [0.7],
    'cvec__min_df': [1],
    'cvec__max_features': [9600]
}

In [5]:
gs_cvec = GridSearchCV(pipe_cvec,
                  pipe_params_cvec,
                  cv = 3)

In [6]:
gs_cvec.fit(X_train, y_train);



In [7]:
gs_cvec_best = gs_cvec.best_estimator_
gs_cvec_best

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.7,
                                 max_features=9600, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('logr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                    

In [8]:
gs_cvec.best_params_

{'cvec__max_df': 0.7,
 'cvec__max_features': 9600,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'cvec__strip_accents': None}

In [9]:
gs_cvec_best.score(X_train, y_train)

0.9379662347860228

In [10]:
gs_cvec_best.score(X_test, y_test)

0.8221234018899388

In [11]:
pipe_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logr', LogisticRegression())
])

pipe_params_tfidf = {
    'tfidf__stop_words': ['english'],
    'tfidf__strip_accents': [None],
    'tfidf__ngram_range': [(1,2)],
    'tfidf__max_df': [0.2],
    'tfidf__min_df': [1],
    'tfidf__max_features': [13250]
}

gs_tfidf = GridSearchCV(pipe_tfidf,
                  pipe_params_tfidf,
                  cv = 3)

In [12]:
gs_tfidf.fit(X_train, y_train);



In [13]:
gs_tfidf_best = gs_tfidf.best_estimator_
gs_tfidf.best_params_

{'tfidf__max_df': 0.2,
 'tfidf__max_features': 9000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__stop_words': 'english',
 'tfidf__strip_accents': None}

In [14]:
gs_tfidf_best.score(X_train,y_train)

0.904691794267766

In [15]:
gs_tfidf_best.score(X_test, y_test)

0.8315730961645359

The model seems to be overfit. Attempting the stemmer may help or hurt my predictions.

In [16]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer('\s+', gaps=True)

In [17]:

for i in range(df.shape[0]):
    text = df['text'][i]
    tokens = tokenizer.tokenize(text)
    stems = [stemmer.stem(token) for token in tokens]
    seperator = ' '
    df['text'][i] = seperator.join(stems)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
df.head(20)

Unnamed: 0,text,is_craft
0,badli damag exterior wall stud is thi out of t...,0
1,need help remov a screw from the back of a tel...,0
2,add second post parallel to exist post i have ...,0
3,rtyuiooopfghjkl,0
4,diy - recycl vinyl record clock,0
5,easi & fast friendship bracelet - simpl friend...,0
6,how to clean & scrub 10 baht coink rama 9 thai...,0
7,ani idea what go on with my basement rot termi...,0
8,need idea for convert rain gutter into planter...,0
9,play with rope,0


In [19]:
X = df['text']
y = df['is_craft']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify = y,
                                                    test_size = 0.15,
                                                    random_state = 42
                                                   )

In [20]:
pipe_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('logr', LogisticRegression())
])

pipe_params_cvec = {
    'cvec__stop_words': ['english'],
    'cvec__strip_accents': [None],
    'cvec__ngram_range': [(1,2)],
    'cvec__max_df': [0.7],
    'cvec__min_df': [1],
    'cvec__max_features': [9600]
}

gs_cvec = GridSearchCV(pipe_cvec,
                  pipe_params_cvec,
                  cv = 3)
gs_cvec.fit(X_train, y_train);

gs_cvec_best = gs_cvec.best_estimator_
gs_cvec_best



Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.7,
                                 max_features=9600, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('logr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                    

In [21]:
print(f'training score is :{gs_cvec_best.score(X_train, y_train)}')
print(f'testing score is :{gs_cvec_best.score(X_test, y_test)}')

training score is :0.9375736160188457
testing score is :0.8321289605336298


In [22]:
pipe_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logr', LogisticRegression())
])

pipe_params_tfidf = {
    'tfidf__stop_words': ['english'],
    'tfidf__strip_accents': [None],
    'tfidf__ngram_range': [(1,2)],
    'tfidf__max_df': [0.7],
    'tfidf__min_df': [1],
    'tfidf__max_features': [13250]
}

gs_tfidf = GridSearchCV(pipe_tfidf,
                  pipe_params_tfidf,
                  cv = 3)

gs_tfidf.fit(X_train, y_train);

gs_tfidf_best = gs_tfidf.best_estimator_
# gs_tfidf.best_params_
print(f'training score is :{gs_tfidf_best.score(X_train, y_train)}')
print(f'testing score is :{gs_tfidf_best.score(X_test, y_test)}')



training score is :0.9078327444051826
testing score is :0.8376876042245692


In [23]:
from sklearn.metrics import confusion_matrix
preds = gs_tfidf_best.predict(X_test)

In [24]:
pd.DataFrame(confusion_matrix(y_test, preds), index=['actual neg', 'actual pos'], columns=['predicted neg', 'actual pos'])

Unnamed: 0,predicted neg,actual pos
actual neg,651,180
actual pos,112,856


### Naive Bayes models
------------

In [25]:
import sklearn.naive_bayes as nb


In [46]:
cvec = CountVectorizer(stop_words='english', ngram_range=(1,2), max_df=.07, max_features=9600)
mnnb = nb.MultinomialNB()
cvec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.07, max_features=9600, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [47]:
mnnb.fit(cvec.transform(X_train).todense(), y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [48]:
mnnb.score(cvec.transform(X_train).todense(), y_train)

0.8595406360424028

In [49]:
mnnb.score(cvec.transform(X_test).todense(), y_test)

0.8365758754863813

In [56]:
tfidf =  TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=0.7, max_features=13250)

gausnb = nb.GaussianNB()
tfidf.fit(X_train)

gausnb.fit(tfidf.transform(X_train).todense(), y_train)

gnb_train = gausnb.score(tfidf.transform(X_train).todense(), y_train)
gnb_test = gausnb.score(tfidf.transform(X_test).todense(), y_test)

print(f'Train score: {gnb_train}')
print(f'Test score: {gnb_test}')

GaussianNB(priors=None, var_smoothing=1e-09)

## Decision trees
----------------------

In [64]:
# from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# rf_test = RandomForestClassifier(n_estimators=100)
# et_test = ExtraTreesClassifier(n_estimators=100)

# tfidf.fit(X_train)

# print(cross_val_score(rf_test, tfidf.transform(X_train).todense(), y_train, cv=5).mean())
# score is 0.813899

# print(cross_val_score(et_test, tfidf.transform(X_train).todense(), y_train, cv=5).mean())
# score is 0.823911

# these trees took almost 20 minutes to calculate and have terrible training scores.

0.8138990501049523
0.8239107425291575


## SVMs
____________________________

In [66]:
from sklearn.svm import SVC

In [83]:
tfidf.fit(X_train)
svc = SVC(kernel='rbf', gamma='scale')
svc.fit(tfidf.transform(X_train).todense(), y_train)
svc.score(tfidf.transform(X_test).todense(), y_test)

0.8532518065591995

In [80]:
# svc = SVC(kernel='rbf', gamma='scale', C=2)
# svc.fit(tfidf.transform(X_train).todense(), y_train)
# svc.score(tfidf.transform(X_test).todense(), y_test)
# score is .843246

0.8432462479155086

In [82]:
# svc = SVC(kernel='rbf', gamma='scale', C=3)
# svc.fit(tfidf.transform(X_train).todense(), y_train)
# svc.score(tfidf.transform(X_test).todense(), y_test)
# score is .8438021

0.8438021122846026

In [84]:
yhat = svc.predict(tfidf.transform(X).todense())
df['best_preds'] = yhat
df.to_csv('best_preds.csv')

In [96]:
df[df['is_craft'] != df['best_preds']].head(15)

Unnamed: 0,text,is_craft,best_preds
5,easi & fast friendship bracelet - simpl friend...,0,1
9,play with rope,0,1
21,simpl & easi paper made wall hang,0,1
24,just love halloween deco,0,1
51,color name with barbi doll,0,1
66,how to make cracker with paper | diy special p...,0,1
83,play doh learn color diy cute shoe cute babi m...,0,1
98,i need to make 5 frame for canvas i bought onl...,0,1
105,arlo netgear login | +1-888-352-3810 | arlo si...,0,1
110,wonder flower of paper basket (5),0,1


In [104]:
len((df[df['is_craft'] != df['best_preds']]['text'])) - len(set((df[df['is_craft'] != df['best_preds']]['text'])))

558

In [112]:
wrong = df[df['is_craft'] != df['best_preds']]
correct = df[df['is_craft'] == df['best_preds']]

double_post = 0
total_posts = 0
for post in set(wrong.text):
    if list(correct['text']).count(post) >= 1:
        double_post += 1
        total_posts += list(correct['text']).count(post)
print(double_post)
print(total_posts)

67

In [116]:
for post in set(wrong.text):
    if list(correct['text']).count(post) >= 1:
        wrong = wrong.drop(wrong[wrong['text'] == post].index)
wrong.to_csv('unaccounted.csv', index=False)