In [1]:
import nltk

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

# Apply seaborn's default theme so the plots in this notebook share one style
sns.set()

In [2]:
# Load the training data from train.csv (path is relative to the working directory)
df = pd.read_csv("train.csv")
# Preview the first 10 rows to sanity-check what was read
df.head(10)

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954
5,1,Worth a read whether you do or don't believe i...,425577
6,1,RT @thenation: Mike Pence doesn’t believe in g...,294933
7,1,RT @makeandmendlife: Six big things we can ALL...,992717
8,1,@AceofSpadesHQ My 8yo nephew is inconsolable. ...,664510
9,1,RT @paigetweedy: no offense… but like… how do ...,260471


In [3]:
# DataFrame.info() prints its report itself and returns None, so it is called
# bare: wrapping it in print() emits a stray "None" after the summary.
df.info()  # datatype and non-null count of each column
print('\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15819 entries, 0 to 15818
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  15819 non-null  int64 
 1   message    15819 non-null  object
 2   tweetid    15819 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 370.9+ KB
None




In [4]:
# Size of the data as (rows, columns), followed by the same blank-line padding
print(df.shape, end='\n\n\n')

(15819, 3)




In [5]:
# Count the null entries in each column to check for missing values
print(df.isnull().sum())

sentiment    0
message      0
tweetid      0
dtype: int64


In [None]:
# Fetch only the corpus the following cells need. Calling nltk.download() with
# no arguments opens NLTK's interactive downloader, which stalls "Run All"
# until someone responds to it; naming the resource keeps the run unattended.
nltk.download('stopwords')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [None]:
from nltk.corpus import stopwords

In [None]:
# Load NLTK's English stop-word list (requires the 'stopwords' corpus to be
# downloaded) and print it for inspection
stopwords_list = stopwords.words('english')
print(stopwords_list)

In [None]:
#code to be used for later work
#type_labels = list(df.tweetid.unique())
#print(type_labels)

## Text Cleaning

### Removing Noise

This next step finds every URL in the tweets and replaces it with the placeholder text 'url-web'.

In [None]:
# Regex matching http/https links, and the placeholder token that stands in for them
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
# Swap every matched link inside each message for the placeholder
df['message'] = df['message'].replace(regex=pattern_url, value=subs_url)

In [None]:
# Preview the first rows after the URL substitution
df.head()

### Remove punctuation

First we make all the text lower case to remove some noise from capitalisation.

In [None]:
# Lower-case every message via the vectorised string accessor (overwrites the column)
df['message'] = df['message'].str.lower()

Now let's remove the punctuation, using the `string.punctuation` constant from Python's standard-library `string` module.

In [None]:
import string
# Show exactly which characters string.punctuation contains
print(string.punctuation)

In [None]:
# Translation table that deletes every character in string.punctuation; built
# once so it is not recomputed for each message.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(post):
    """Return ``post`` with every character in ``string.punctuation`` removed.

    Parameters
    ----------
    post : str
        Text to clean.

    Returns
    -------
    str
        ``post`` without ASCII punctuation. Non-ASCII marks such as curly
        quotes or the ellipsis character are not part of
        ``string.punctuation`` and are therefore kept.
    """
    # str.translate drops the characters in a single pass instead of testing
    # each character against the punctuation string in a Python-level loop.
    return post.translate(_PUNCTUATION_TABLE)

In [None]:
# Strip punctuation from every message, then inspect one cleaned example
df['message'] = df['message'].map(remove_punctuation)
df['message'].iloc[3]

### Tokenisation

In [None]:
from nltk.tokenize import word_tokenize, TreebankWordTokenizer

Split each message into tokens using NLTK's `TreebankWordTokenizer` class.

In [None]:
# Tokenise each cleaned message and keep the token lists in a new column
tokeniser = TreebankWordTokenizer()
df['tokensmessage'] = df['message'].map(tokeniser.tokenize)

In [None]:
# Inspect the token list produced for one sample message (row position 3)
df['tokensmessage'].iloc[3]

### Stemming

In [None]:
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer

In [None]:
# English Snowball stemmer, used to stem the token lists in the cells that follow
stemmer = SnowballStemmer('english')
# code kept for later use
#for word in words.split():
    #print(stemmer.stem(word))

In [None]:
def df_stemmer(words, stemmer):
    """Stem every token in ``words``.

    ``stemmer`` is any object exposing ``stem(word)``. The result is a new
    list holding one stem per input token, in the original order.
    """
    stems = []
    for token in words:
        stems.append(stemmer.stem(token))
    return stems

In [None]:
# Stem each row's token list; the shared stemmer is forwarded as a keyword argument
df['stem'] = df['tokensmessage'].apply(df_stemmer, stemmer=stemmer)

In [None]:
# Inspect the stems produced for the same sample message (row position 3)
df['stem'].iloc[3]

Print each token of the sample tweet next to the stem produced for it.

In [None]:
# Look the sample row up once instead of re-indexing the frame on every
# iteration, and walk tokens and stems in lockstep rather than by position.
sample_tokens = df['tokensmessage'].iloc[3]
sample_stems = df['stem'].iloc[3]
for token, stem in zip(sample_tokens, sample_stems):
    print('{:20s} --> {:10s}'.format(token, stem))

### Lemmatization


In [None]:
from nltk.stem import WordNetLemmatizer
# Fetch the 'wordnet' corpus before the WordNet lemmatizer is used
nltk.download('wordnet')

# Single lemmatizer instance, passed to df_lemma when the lemma column is built
lemmatizer = WordNetLemmatizer()

In [None]:
def df_lemma(words, lemmatizer):
    """Lemmatize every token in ``words``.

    ``lemmatizer`` is any object exposing ``lemmatize(word)``. The result is a
    new list holding one lemma per input token, in the original order.
    """
    return list(lemmatizer.lemmatize(token) for token in words)

In [None]:
# Lemmatize each row's token list; the shared lemmatizer is forwarded as a keyword argument
df['lemma'] = df['tokensmessage'].apply(df_lemma, lemmatizer=lemmatizer)

In [None]:
# Look the sample row up once instead of re-indexing the frame on every
# iteration, and walk tokens and lemmas in lockstep rather than by position.
sample_tokens = df['tokensmessage'].iloc[3]
sample_lemmas = df['lemma'].iloc[3]
for token, lemma in zip(sample_tokens, sample_lemmas):
    print('{:20s} --> {:10s}'.format(token, lemma))