In [1]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
data = pd.read_csv("tsa_train.csv")
new = pd.read_csv("tsa_test.csv")
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [2]:
data.shape

(2077, 3)

In [3]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
data['label'].value_counts()

0    1928
1     149
Name: label, dtype: int64

# Preprocessing Data 

In [5]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer() #

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['tweet'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['tweet'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

# Split into train-test

In [6]:
from sklearn.model_selection import train_test_split
X=data[['tweet', 'body_len', 'punct%']]
y=data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)

In [7]:
from sklearn.model_selection import train_test_split
X=data[['tweet', 'body_len', 'punct%']]
y=data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)

# Vectorise Text

In [8]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['tweet'])

tfidf_train = tfidf_vect_fit.transform(X_train['tweet'])
tfidf_test = tfidf_vect_fit.transform(X_test['tweet'])

X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,5117,5118,5119,5120,5121,5122,5123,5124,5125,5126
0,51,5.9,0.109255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.762025,0.0
1,58,5.2,0.083327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,61,9.8,0.063664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,48,14.6,0.06374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31,9.7,0.110597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Final Evaluation of models

In [9]:
from sklearn.neural_network import MLPClassifier

In [10]:
clf = MLPClassifier(hidden_layer_sizes = (10,10,10), max_iter = 300)

In [11]:
ann_model = clf.fit(X_train_vect, y_train)

In [12]:
y_pred = clf.predict(X_test_vect)

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import accuracy_score, classification_report

In [14]:
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

Confusion Matrix: 
 [[371  10]
 [ 23  12]]


In [15]:
print('Accuracy: ', accuracy_score(y_test, y_pred)*100)

Accuracy:  92.0673076923077


In [16]:
print('\n Artificial Neural Network Classification Report:\n', classification_report(y_test, y_pred))


 Artificial Neural Network Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96       381
           1       0.55      0.34      0.42        35

    accuracy                           0.92       416
   macro avg       0.74      0.66      0.69       416
weighted avg       0.91      0.92      0.91       416



# Test

In [17]:
data = pd.read_csv("tsa_test.csv")

In [18]:
new = data.head(25)
new

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."
5,31968,choose to be :) #momtips
6,31969,something inside me dies ð¦ð¿â¨ eyes nes...
7,31970,#finished#tattoo#inked#ink#loveitâ¤ï¸ #â¤ï¸...
8,31971,@user @user @user i will never understand why...
9,31972,#delicious #food #lovelife #capetown mannaep...


In [19]:
new['body_len'] = new['tweet'].apply(lambda x: len(x) - x.count(" "))
new['punct%'] = new['tweet'].apply(lambda x: count_punct(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['body_len'] = new['tweet'].apply(lambda x: len(x) - x.count(" "))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['punct%'] = new['tweet'].apply(lambda x: count_punct(x))


In [20]:
new_vect = tfidf_vect_fit.transform(new['tweet'])
new_vect

<25x5127 sparse matrix of type '<class 'numpy.float64'>'
	with 153 stored elements in Compressed Sparse Row format>

In [21]:
sample_vect = pd.concat([new[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(new_vect.toarray())], axis=1)
sample_vect

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,5117,5118,5119,5120,5121,5122,5123,5124,5125,5126
0,79,8.9,0.12839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,82,6.1,0.15269,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,57,14.0,0.065474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,119,6.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38786,0.0
4,76,5.3,0.210483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,20,15.0,0.150077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,78,6.4,0.060678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282143,0.0
7,81,13.6,0.041561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.872021,0.0,0.0,0.0
8,86,12.8,0.15379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,58,8.6,0.097838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
ann_model.predict(sample_vect)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0], dtype=int64)