###### Cory Melendez
###### Natural Language Processing Project
###### https://github.com/cmelende/NLPProject.git
###### 12/11/20


###### 1. Import libraries, load dataset print shape of data, data description

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
import re
import spacy
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup

In [24]:
data_source = './data/Tweets.csv'
nltk.download('stopwords')
english_stopwords = stopwords.words("english")
df = pd.read_csv(data_source)
print("Shape of Data is: ", df.shape)
print("Description of data is: /n", df.describe())


Shape of Data is:  (14640, 15)
Description of data is: /n            tweet_id  airline_sentiment_confidence  negativereason_confidence  \
count  1.464000e+04                  14640.000000               10522.000000   
mean   5.692184e+17                      0.900169                   0.638298   
std    7.791112e+14                      0.162830                   0.330440   
min    5.675883e+17                      0.335000                   0.000000   
25%    5.685592e+17                      0.692300                   0.360600   
50%    5.694779e+17                      1.000000                   0.670600   
75%    5.698905e+17                      1.000000                   1.000000   
max    5.703106e+17                      1.000000                   1.000000   

       retweet_count  
count   14640.000000  
mean        0.082650  
std         0.745778  
min         0.000000  
25%         0.000000  
50%         0.000000  
75%         0.000000  
max        44.000000  


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Corym\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
tweet_col = 'tweet_id'
airline_sentiment = 'airline_sentiment'
airline_sentiment_confidence = 'airline_sentiment_confidence'
negative_reason = 'negativereason'
negative_reason_confidence = 'negativereason_confidence'
airline = 'airline'
airline_sentiment_gold = 'airline_sentiment_gold'
name = 'name'
negative_reason_gold = 'negativereason_gold'
retweet_count = 'retweet_count'
text = 'text'
tweet_coord = 'tweet_coord'
tweet_created = 'tweet_created'
tweet_location = 'tweet_location'
user_timezone = 'user_timezone'
all_cols = [tweet_col, airline_sentiment, airline_sentiment_confidence,
            negative_reason, negative_reason_confidence, airline,
            airline_sentiment_gold, name, negative_reason_gold,
            retweet_count, text, tweet_coord, tweet_created,
            tweet_location, user_timezone]

##### 2. Understand the data columns

###### a. Drop all other columns except 'text' and 'airline_sentiment'

In [26]:
def remove_columns(df, keep_columns, all_columns):
    copy = pd.DataFrame()
    for col in all_columns:
        if col in keep_columns:
            copy[col] = df[col]

    return copy

###### b. Check the shape of the data

In [27]:
trimmed_df = remove_columns(df, [text, airline_sentiment], all_cols)
print("shape: ", trimmed_df.shape)

shape:  (14640, 2)


###### c. Print the first 5 rows

In [28]:
print("first 5 rows")
print(trimmed_df.head(5))

first 5 rows
  airline_sentiment                                               text
0           neutral                @VirginAmerica What @dhepburn said.
1          positive  @VirginAmerica plus you've added commercials t...
2           neutral  @VirginAmerica I didn't today... Must mean I n...
3          negative  @VirginAmerica it's really aggressive to blast...
4          negative  @VirginAmerica and it's a really big bad thing...


##### 3. Text Pre-processing: Data Preparation

###### note: doing all of the above in cell below
###### a. Html tag removal
###### c. Remove the numbers
###### d. remove special characters and punctuations
###### e. conversion to lowercase
###### g. join the words in the list to convert back to text string in the dataframe (So that each row contains the data in text format)

In [29]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")    # Removing HTML tags
    return soup.get_text()

def remove_special_characters_numbers(text):
    pattern = r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

def to_lowercase(text):
    text = text.lower()
    return text

def clean_text(text):
    text = strip_html(text)
    text = remove_special_characters_numbers(text)
    text = to_lowercase(text)
    return text

cleaned_df = trimmed_df.copy()
cleaned_df[text] = trimmed_df[text].apply(clean_text)

###### b. Tokenization

In [30]:
def tokenize(text):
    tokenizer=ToktokTokenizer()
    tokens=tokenizer.tokenize(text)
    return tokens

def remove_stopwords(text):
    token_array = tokenize(text)
    words = [w for w in token_array if not w in english_stopwords]
    reassembled_string = reassemble_token_array(words)
    return reassembled_string

def reassemble_token_array(token_array):
    space = ' '
    reassembled_string = space.join(token_array)
    return reassembled_string

cleaned_df[text] = cleaned_df[text].apply(remove_stopwords)

###### f. lemmatatize or stemming


In [31]:
# for some reason, this is the only way i could get stemming to work, I couldnt use the regular method that was given in the lecture b/c it kept throwing an error like it couldnt find it after loadin
import en_core_web_sm
nlp = en_core_web_sm.load()
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

cleaned_df[text] = cleaned_df[text].apply(lemmatize_text)

###### h. print first 5 rows of data after pre-processing

In [32]:
stemmed_df = cleaned_df
stemmed_df.head(5)

Unnamed: 0,airline_sentiment,text
0,neutral,virginamerica dhepburn say
1,positive,virginamerica plus you have add commercial exp...
2,neutral,virginamerica do not today must mean need take...
3,negative,virginamerica really aggressive blast obnoxiou...
4,negative,virginamerica really big bad thing


##### 5.

###### a. Use CountVectorizer

In [33]:
def get_all_tweets(df, text_col):
    tweet_array = []
    for i in range( 0, df[text_col].size ):
        tweet_array.append(df[text_col][i])

    return tweet_array

all_tweets = get_all_tweets(stemmed_df, text)

['virginamerica dhepburn say',
 'virginamerica plus you have add commercial experience tacky',
 'virginamerica do not today must mean need take another trip',
 'virginamerica really aggressive blast obnoxious entertainment guest face little recourse',
 'virginamerica really big bad thing',
 'virginamerica seriously would pay flight seat do not play really bad thing fly va',
 'virginamerica yes nearly every time fly vx ear worm will not go away',
 'virginamerica really miss prime opportunity man without hats parody httpstcomwpggrezp',
 'virginamerica well didntbut',
 'virginamerica amazing arrived hour early you be good',
 'virginamerica know suicide second lead cause death among teen',
 'virginamerica pretty graphic much well minimal iconography',
 'virginamerica great deal already think nd trip australia have not even go st trip yet p',
 'virginamerica virginmedia i be fly fabulous seductive sky u take stress away travel httptcoahlxhhkiyn',
 'virginamerica thanks',
 'virginamerica sfo

In [39]:
count_vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 1000)
count_train_data_features = count_vectorizer.fit_transform(all_tweets)
count_train_data_features = count_train_data_features.toarray()
count_train_data_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [40]:
labels = stemmed_df[airline_sentiment]

###### b. Use TfidVectorizer

In [42]:
tfid_vectorizer = TfidfVectorizer(max_features=1000)
tfid_data_features = tfid_vectorizer.fit_transform(stemmed_df[text])
tfid_data_features = tfid_data_features.toarray()
tfid_data_features.shape

(14640, 1000)

##### 5. build and eval models

###### Count Vectorizer model

In [43]:
forest = RandomForestClassifier(verbose=2,n_jobs=-1,n_estimators = 100)
forest = forest.fit( count_train_data_features, labels )
print (forest)
print (np.mean(cross_val_score(forest,count_train_data_features,labels,cv=10)))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    7.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs

building tree 1 of 100building tree 2 of 100building tree 3 of 100

building tree 4 of 100
building tree 5 of 100

building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

###### Tfid Vectorizer model

In [44]:
forest = RandomForestClassifier(verbose=2,n_jobs=-1,n_estimators = 100)
forest = forest.fit( tfid_data_features, labels )
print (forest)
print (np.mean(cross_val_score(forest,tfid_data_features,labels,cv=10)))

building tree 1 of 100building tree 2 of 100building tree 3 of 100
building tree 4 of 100building tree 5 of 100building tree 6 of 100

building tree 7 of 100
building tree 8 of 100



building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs

##### 6. Summarize your understanding of the application of various pre-processing and vectorization and performance of your model on this dataset
kinda hard to see, but we have scores:

count: 0.7204918032786886

0.7165300546448088

Before running this, i expected to the count to perform better, but i didnt expect it to be this close. I had thought since we were measuring the sentiment and of the
tweets, that counting the number of words in a tweet seems to be a more natural way of measuring whether or not that tweet has a positive, neutral or negative sentiment.