In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [2]:
tweets = pd.read_csv(r'C:\Users\Dorris\Downloads\Tweets.csv')

In [3]:
tweets.drop(tweets.columns.difference(['text','airline']), axis = 1,inplace = True)

In [4]:
tweets.shape

(14640, 2)

In [5]:
tweets.head(5)

Unnamed: 0,airline,text
0,Virgin America,@VirginAmerica What @dhepburn said.
1,Virgin America,@VirginAmerica plus you've added commercials t...
2,Virgin America,@VirginAmerica I didn't today... Must mean I n...
3,Virgin America,@VirginAmerica it's really aggressive to blast...
4,Virgin America,@VirginAmerica and it's a really big bad thing...


In [6]:
#removing HTML Tags
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(tweets):
    return TAG_RE.sub('',tweets)

In [7]:
tweets.head(5)

Unnamed: 0,airline,text
0,Virgin America,@VirginAmerica What @dhepburn said.
1,Virgin America,@VirginAmerica plus you've added commercials t...
2,Virgin America,@VirginAmerica I didn't today... Must mean I n...
3,Virgin America,@VirginAmerica it's really aggressive to blast...
4,Virgin America,@VirginAmerica and it's a really big bad thing...


In [8]:
#tokenization
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
tokens = tokenizer.tokenize(tweets)
print(tokens)

['airline', 'text', '0', 'Virgin', 'America', '@VirginAmerica', 'What', '@dhepburn', 'said.', '1', 'Virgin', 'America', '@VirginAmerica', 'plus', 'you', "'", 've', 'added', 'commercials', 't', '...', '2', 'Virgin', 'America', '@VirginAmerica', 'I', 'didn', "'", 't', 'today', '...', 'Must', 'mean', 'I', 'n', '...', '3', 'Virgin', 'America', '@VirginAmerica', 'it', "'", 's', 'really', 'aggressive', 'to', 'blast', '...', '4', 'Virgin', 'America', '@VirginAmerica', 'and', 'it', "'", 's', 'a', 'really', 'big', 'bad', 'thing', '...', '...', '...', '...', '14635', 'American', '@AmericanAir', 'thank', 'you', 'we', 'got', 'on', 'a', 'different', 'f', '...', '14636', 'American', '@AmericanAir', 'leaving', 'over', '20', 'minutes', 'Late', 'Flig', '...', '14637', 'American', '@AmericanAir', 'Please', 'bring', 'American', 'Airlines', 'to', '...', '14638', 'American', '@AmericanAir', 'you', 'have', 'my', 'money', ',', 'you', 'change', 'my', '...', '14639', 'American', '@AmericanAir', 'we', 'have', '

In [9]:
#removing numbers
tweets.replace('\d+', '', regex=True , inplace=True)
print(tweets)

              airline                                               text
0      Virgin America                @VirginAmerica What @dhepburn said.
1      Virgin America  @VirginAmerica plus you've added commercials t...
2      Virgin America  @VirginAmerica I didn't today... Must mean I n...
3      Virgin America  @VirginAmerica it's really aggressive to blast...
4      Virgin America  @VirginAmerica and it's a really big bad thing...
...               ...                                                ...
14635        American  @AmericanAir thank you we got on a different f...
14636        American  @AmericanAir leaving over  minutes Late Flight...
14637        American  @AmericanAir Please bring American Airlines to...
14638        American  @AmericanAir you have my money, you change my ...
14639        American  @AmericanAir we have  ppl so we need  know how...

[14640 rows x 2 columns]


In [10]:
#Removing special characters and punctuations
df = pd.DataFrame(tweets)
spec_chars = ["!",'"',"#","%","&","'","(",")","*","+",",","-",".","/",":",";","<","=",">","?","@","[","\\","]","^","_","{","}","|"]

for char in spec_chars:
    df['text'] = df['text'].str.replace(char,' ')

In [11]:
tweets.head(5)

Unnamed: 0,airline,text
0,Virgin America,VirginAmerica What dhepburn said
1,Virgin America,VirginAmerica plus you ve added commercials t...
2,Virgin America,VirginAmerica I didn t today Must mean I n...
3,Virgin America,VirginAmerica it s really aggressive to blast...
4,Virgin America,VirginAmerica and it s a really big bad thing...


In [12]:
#converting to lowercase
tweets['text'] = df['text'].str.lower()
print(tweets)

              airline                                               text
0      Virgin America                 virginamerica what  dhepburn said 
1      Virgin America   virginamerica plus you ve added commercials t...
2      Virgin America   virginamerica i didn t today    must mean i n...
3      Virgin America   virginamerica it s really aggressive to blast...
4      Virgin America   virginamerica and it s a really big bad thing...
...               ...                                                ...
14635        American   americanair thank you we got on a different f...
14636        American   americanair leaving over  minutes late flight...
14637        American   americanair please bring american airlines to...
14638        American   americanair you have my money  you change my ...
14639        American   americanair we have  ppl so we need  know how...

[14640 rows x 2 columns]


In [13]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

simple_stemmer("text")

'text'

In [15]:
#lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in tweets]

tweets['text'].apply(lemmatize_text)
print(tweets)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dorris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


              airline                                               text
0      Virgin America                 virginamerica what  dhepburn said 
1      Virgin America   virginamerica plus you ve added commercials t...
2      Virgin America   virginamerica i didn t today    must mean i n...
3      Virgin America   virginamerica it s really aggressive to blast...
4      Virgin America   virginamerica and it s a really big bad thing...
...               ...                                                ...
14635        American   americanair thank you we got on a different f...
14636        American   americanair leaving over  minutes late flight...
14637        American   americanair please bring american airlines to...
14638        American   americanair you have my money  you change my ...
14639        American   americanair we have  ppl so we need  know how...

[14640 rows x 2 columns]


In [16]:
tweets['text'] = tweets['text'].apply(lambda x: ''.join(x))

In [17]:
tweets.head()

Unnamed: 0,airline,text
0,Virgin America,virginamerica what dhepburn said
1,Virgin America,virginamerica plus you ve added commercials t...
2,Virgin America,virginamerica i didn t today must mean i n...
3,Virgin America,virginamerica it s really aggressive to blast...
4,Virgin America,virginamerica and it s a really big bad thing...


In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from lightgbm import LGBMClassifier
from sklearn.pipeline import make_pipeline
from warnings import filterwarnings
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

In [19]:
twitter_sentiment = Pipeline([('CVec', CountVectorizer(CountVectorizer(stop_words='english'))),
                     ('Tfidf', TfidfTransformer()),
                      ('norm', Normalizer()),
                    ('tSVD', TruncatedSVD(n_components=100)),
                     ('lgb', LGBMClassifier(n_jobs=-1))])



In [20]:
%%time
cv_pred = cross_validate(twitter_sentiment, 
                             tweets['text'], 
                             tweets['airline'], 
                             cv=5,
                             scoring=('roc_auc_ovr'))

Wall time: 41.7 s


In [21]:
sorted(cv_pred.keys())

['fit_time', 'score_time', 'test_score']

In [22]:
cv_pred['test_score']

array([0.99973666, 0.99971935, 0.99955616, 0.99980692, 0.99986504])

In [None]:
#Summary
#Text preprocessing and vectorization helps transform the tweets data into a more digestible form so that machine learning algorithms can perform better.