In [1]:
import nltk

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

# Apply seaborn's default theme so matplotlib figures drawn later share one style.
sns.set()

In [3]:
# Load the dataset from train.csv (path is relative to the working directory).
df = pd.read_csv("train.csv")
# Preview the first ten rows.
df.head(10)

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954
5,1,Worth a read whether you do or don't believe i...,425577
6,1,RT @thenation: Mike Pence doesn’t believe in g...,294933
7,1,RT @makeandmendlife: Six big things we can ALL...,992717
8,1,@AceofSpadesHQ My 8yo nephew is inconsolable. ...,664510
9,1,RT @paigetweedy: no offense… but like… how do ...,260471


In [4]:
# Show the dtype and non-null count of every column.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare: wrapping it in print() appended a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15819 entries, 0 to 15818
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  15819 non-null  int64 
 1   message    15819 non-null  object
 2   tweetid    15819 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 370.9+ KB
None




In [5]:
# Print the (rows, columns) shape of the data, followed by blank padding lines.
print(df.shape)
print('\n')

(15819, 3)




In [6]:
# Count the missing values in each column (isna is the alias of isnull).
print(df.isna().sum())

sentiment    0
message      0
tweetid      0
dtype: int64


In [7]:
# Fetch only the stopword corpus used by the next cells. Calling
# nltk.download() with no argument opens the interactive downloader, which
# blocks "Restart & Run All" and depends on manual selection.
# NOTE(review): if later cells rely on other NLTK resources, download them
# here by identifier as well.
nltk.download('stopwords')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [8]:
from nltk.corpus import stopwords

In [9]:
# English stopword list from NLTK's stopwords corpus.
# NOTE(review): some entries contain apostrophes (e.g. "you're"), while
# remove_punctuation later strips apostrophes from the messages, so those
# entries will not match cleaned tokens - confirm when stopwords are applied.
stopwords_list = stopwords.words('english')
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [31]:
#code to be used for later work
#type_labels = list(df.tweetid.unique())
#print(type_labels)

## Text Cleaning

### Removing Noise

This next step replaces every URL in a message with the placeholder text 'url-web' (which becomes 'urlweb' once punctuation is stripped further down).

In [19]:
# Swap every http/https link in a message for a fixed placeholder token.
# NOTE: inside `[$-_@.&+]` the `$-_` part is a character range (0x24-0x5F);
# it is the only thing that lets '/', ':', '?' and '=' match, so the hyphen
# must stay unescaped or URLs would be cut off at the first path separator.
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
df['message'] = df['message'].str.replace(pattern_url, subs_url, regex=True)

In [20]:
# Preview the first rows after the URL substitution.
df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


### Remove punctuation

First we make all the text lower case to remove some noise from capitalisation.

In [21]:
# Lower-case every message so words differing only in capitalisation compare equal.
df['message'] = df['message'].str.lower()

Now let's remove the punctuation using the `string` import.

In [23]:
import string
# Show the ASCII punctuation characters that will be stripped from the messages.
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [24]:
# Deletion table built once: maps every ASCII punctuation character to None.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(post):
    """Return ``post`` with every ASCII punctuation character removed.

    Parameters
    ----------
    post : str
        Text to clean.

    Returns
    -------
    str
        ``post`` without any character found in ``string.punctuation``;
        all other characters (letters, digits, whitespace) keep their order.

    Notes
    -----
    Only ASCII punctuation is covered. Typographic marks such as ``’`` or
    ``…`` are not part of ``string.punctuation`` and pass through unchanged.
    """
    # str.translate drops the characters in a single pass instead of scanning
    # string.punctuation once per character and joining a temporary list.
    return post.translate(_PUNCTUATION_TABLE)

In [26]:
# Strip punctuation from every message, then inspect the sample tweet at position 3.
df['message'] = df['message'].map(remove_punctuation)
df['message'].iat[3]

'todayinmaker wired  2016 was a pivotal year in the war on climate change urlweb'

### Tokenisation

In [27]:
from nltk.tokenize import word_tokenize, TreebankWordTokenizer

Tokenise each message using NLTK's `TreebankWordTokenizer` class.

In [28]:
# Split each cleaned message into a list of tokens, stored in a new column.
tokeniser = TreebankWordTokenizer()
df['tokensmessage'] = df['message'].apply(tokeniser.tokenize)

In [29]:
# Inspect the tokens produced for the sample tweet at position 3.
df['tokensmessage'].iloc[3]

['todayinmaker',
 'wired',
 '2016',
 'was',
 'a',
 'pivotal',
 'year',
 'in',
 'the',
 'war',
 'on',
 'climate',
 'change',
 'urlweb']

### Stemming

In [30]:
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer

In [39]:
# Create the English Snowball stemmer that is applied to the token lists below.
stemmer = SnowballStemmer('english')
# code for later use
#for word in words.split():
    #print(stemmer.stem(word))

In [40]:
def df_stemmer(words, stemmer):
    """Stem every token in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens to stem.
    stemmer : object
        Any object exposing a ``stem(word)`` method.

    Returns
    -------
    list
        One stem per input token, in the original order.
    """
    stems = []
    for token in words:
        stems.append(stemmer.stem(token))
    return stems

In [41]:
# Stem each message's token list; the stemmer is forwarded to df_stemmer by keyword.
df['stem'] = df['tokensmessage'].apply(df_stemmer, stemmer=stemmer)

In [42]:
# Inspect the stems produced for the sample tweet at position 3.
df['stem'].iloc[3]

['todayinmak',
 'wire',
 '2016',
 'was',
 'a',
 'pivot',
 'year',
 'in',
 'the',
 'war',
 'on',
 'climat',
 'chang',
 'urlweb']

Print each token of a sample tweet next to the stem produced for it.

In [44]:
# Show each token of the sample tweet (position 3) beside its stem.
sample_row = df.iloc[3]
for token, stem in zip(sample_row['tokensmessage'], sample_row['stem']):
    print ('{:20s} --> {:10s}'.format(token, stem))

todayinmaker         --> todayinmak
wired                --> wire      
2016                 --> 2016      
was                  --> was       
a                    --> a         
pivotal              --> pivot     
year                 --> year      
in                   --> in        
the                  --> the       
war                  --> war       
on                   --> on        
climate              --> climat    
change               --> chang     
urlweb               --> urlweb    


### Lemmatization


In [45]:
from nltk.stem import WordNetLemmatizer
# Make sure the WordNet corpus is available locally before lemmatising.
nltk.download('wordnet')

# Lemmatizer instance that is applied to the token lists below.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [46]:
def df_lemma(words, lemmatizer):
    """Lemmatise every token in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens to lemmatise.
    lemmatizer : object
        Any object exposing a ``lemmatize(word)`` method.

    Returns
    -------
    list
        One lemma per input token, in the original order.

    Notes
    -----
    NOTE(review): ``lemmatize`` is called without a part-of-speech argument.
    With NLTK's WordNetLemmatizer that means every token is treated as a
    noun, which is presumably why 'was' shows up as 'wa' in the sample
    output - confirm whether POS-aware lemmatisation is wanted.
    """
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [47]:
# Lemmatise each message's token list; the lemmatizer is forwarded to df_lemma by keyword.
df['lemma'] = df['tokensmessage'].apply(df_lemma, lemmatizer=lemmatizer)

In [48]:
# Show each token of the sample tweet (position 3) beside its lemma.
sample_row = df.iloc[3]
for token, lemma in zip(sample_row['tokensmessage'], sample_row['lemma']):
    print ('{:20s} --> {:10s}'.format(token, lemma))

todayinmaker         --> todayinmaker
wired                --> wired     
2016                 --> 2016      
was                  --> wa        
a                    --> a         
pivotal              --> pivotal   
year                 --> year      
in                   --> in        
the                  --> the       
war                  --> war       
on                   --> on        
climate              --> climate   
change               --> change    
urlweb               --> urlweb    
