# Importing Libraries and Data

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
data = pd.read_csv("tsa_train.csv")
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [2]:
data.shape

(2077, 3)

In [3]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
data['label'].value_counts()

0    1928
1     149
Name: label, dtype: int64

# Preprocessing Data 

In [5]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer() #

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['tweet'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['tweet'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

# Split into train-test

In [6]:
from sklearn.model_selection import train_test_split
X=data[['tweet', 'body_len', 'punct%']]
y=data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)

In [7]:
X_test.head()

Unnamed: 0,tweet,body_len,punct%
1694,@user @user looking forward to seeing the new...,51,3.9
930,#friday xoxosamantha via,22,4.5
1301,yes!! waiting to see what's in store for ed an...,92,7.6
1974,ð #love #instagood #photooftheday top.tags ...,82,12.2
145,happiest place on eah ð« #disneysmagickingdo...,84,6.0


# Vectorise Text

In [8]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['tweet'])

tfidf_train = tfidf_vect_fit.transform(X_train['tweet'])
tfidf_test = tfidf_vect_fit.transform(X_test['tweet'])

X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,5117,5118,5119,5120,5121,5122,5123,5124,5125,5126
0,51,5.9,0.109255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.762025,0.0
1,58,5.2,0.083327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,61,9.8,0.063664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,48,14.6,0.06374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31,9.7,0.110597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Final Evaluation of models 

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [10]:
rf = RandomForestClassifier(n_estimators=150, random_state=0)
rf_model = rf.fit(X_train_vect, y_train)
y_pred = rf_model.predict(X_test_vect)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[381   0]
 [ 31   4]]


In [11]:
accuracy_score(y_test, y_pred) * 100

92.54807692307693

In [13]:
print('\nRandom Forest Classification Report:\n', classification_report(y_test, y_pred))


Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96       381
           1       1.00      0.11      0.21        35

    accuracy                           0.93       416
   macro avg       0.96      0.56      0.58       416
weighted avg       0.93      0.93      0.90       416



# Test

In [18]:
data = pd.read_csv("tsa_test.csv", names=['tweet'])

In [26]:
new = data.head(200)
new

Unnamed: 0,tweet
id,tweet
31963,#studiolife #aislife #requires #passion #dedic...
31964,@user #white #supremacists want everyone to s...
31965,safe ways to heal your #acne!! #altwaystohe...
31966,is the hp and the cursed child book up for res...
...,...
32157,i messed up my nails
32158,can #lighttherapy help with or #depression? ...
32159,aww yeah it's all good bing bong bing bong
32160,selfie srelfie! #smile #me #selfie #followme...


In [27]:
new['body_len'] = new['tweet'].apply(lambda x: len(x) - x.count(" "))
new['punct%'] = new['tweet'].apply(lambda x: count_punct(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['body_len'] = new['tweet'].apply(lambda x: len(x) - x.count(" "))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['punct%'] = new['tweet'].apply(lambda x: count_punct(x))


In [28]:
new_vect = tfidf_vect_fit.transform(new['tweet'])
new_vect

<200x5127 sparse matrix of type '<class 'numpy.float64'>'
	with 1413 stored elements in Compressed Sparse Row format>

In [29]:
new

Unnamed: 0,tweet,body_len,punct%
id,tweet,5,0.0
31963,#studiolife #aislife #requires #passion #dedic...,79,8.9
31964,@user #white #supremacists want everyone to s...,82,6.1
31965,safe ways to heal your #acne!! #altwaystohe...,57,14.0
31966,is the hp and the cursed child book up for res...,119,6.7
...,...,...,...
32157,i messed up my nails,16,0.0
32158,can #lighttherapy help with or #depression? ...,70,11.4
32159,aww yeah it's all good bing bong bing bong,34,2.9
32160,selfie srelfie! #smile #me #selfie #followme...,71,11.3


In [30]:
sample_vect = pd.concat([new[['body_len', 'punct%']].reset_index(drop=True),
pd.DataFrame(new_vect.toarray())], axis=1)

In [31]:
sample_vect

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,5117,5118,5119,5120,5121,5122,5123,5124,5125,5126
0,5,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0
1,79,8.9,0.128390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0
2,82,6.1,0.152690,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0
3,57,14.0,0.065474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0
4,119,6.7,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38786,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,16,0.0,0.150077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0
196,70,11.4,0.083293,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0
197,34,2.9,0.058636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0
198,71,11.3,0.081532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0


In [32]:
rf_model.predict(sample_vect)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0], dtype=int64)