# IFT6390 Project - Preprocessing

## Setup - data

In [1]:
import pandas as pd
import numpy as np

### Load CSV

In [2]:
# Read the three raw corpora from CSV into DataFrames. The short names follow the
# file names: s140 = sentiment140, cc = climatechange, mr = moviereview.
# Paths are relative to the notebook's working directory.
s140 = pd.read_csv('data/sentiment140.csv')
cc = pd.read_csv('data/climatechange.csv')
mr = pd.read_csv('data/moviereview.csv')

### Create pickle

In [3]:
# Write each raw DataFrame to a pickle file under data/ so it can be reloaded
# with pd.read_pickle (presumably to avoid re-parsing the CSV files each session).
s140.to_pickle('data/s140.pkl')
cc.to_pickle('data/cc.pkl')
mr.to_pickle('data/mr.pkl')

### Load Pickle

In [11]:
# Reload the raw DataFrames from the pickles written by the "Create pickle" cell.
# NOTE: unpickling can execute arbitrary code; only load pickle files that were
# produced by this notebook, never files from an untrusted source.
s140 = pd.read_pickle('data/s140.pkl')
cc = pd.read_pickle('data/cc.pkl')
mr = pd.read_pickle('data/mr.pkl')

## Transform

In [4]:
# Natural Language Toolkit (NLTK) setup for the text-cleaning helpers.
import nltk

# Fetch the NLTK data packages; the captured output shows they are skipped
# when already up to date. Presumably these back the tokenizer, stop-word
# corpus and WordNet lemmatizer imported below.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize # splits a string into a list of word tokens
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# English stop words, stored as a set for fast membership tests when
# filtering tokens in the cleaning helpers.
stopw = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/rd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/rd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/rd/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
import re
from re import sub
import string
from sklearn.feature_extraction.text import CountVectorizer


class clean:
    """Namespace of text-cleaning helpers used to preprocess raw comments.

    Every helper is a static method and is meant to be called through the
    class, e.g. ``clean.lemmatize(text)`` or ``series.apply(clean.lemmatize)``.
    The tokenising helpers rely on the notebook-level names ``word_tokenize``,
    ``stopw``, ``WordNetLemmatizer`` and ``PorterStemmer``.
    """

    # Anything starting with "http", or with a literal "www.", up to the next
    # whitespace. Raw string: "\S" is not a valid escape in a normal string.
    # The dot is escaped so words such as "awwwwww" are not mistaken for URLs.
    # SRC -> https://stackoverflow.com/questions/51994254/removing-url-from-a-column-in-pandas-dataframe
    _URL_PATTERN = r'http\S+|www\.\S+'
    _LINK_TOKEN = '_link_'

    @staticmethod
    def url(s):
        """Replace every URL by the ``_link_`` token (case-insensitive).

        Parameters
        ----------
        s : str or pandas.Series
            A single string, or a Series of strings handled element-wise.

        Returns
        -------
        str or pandas.Series
            Same kind as the input, with URLs replaced.
        """
        if isinstance(s, pd.Series):
            # regex=True is explicit because recent pandas versions default
            # to a literal (non-regex) replacement.
            return s.str.replace(clean._URL_PATTERN, clean._LINK_TOKEN,
                                 case=False, regex=True)
        return sub(clean._URL_PATTERN, clean._LINK_TOKEN, s, flags=re.IGNORECASE)

    @staticmethod
    def rm_repeat(w: str) -> str:
        """Collapse any character repeated 3 or more times in a row into one.

        Example: ``"sooooo"`` -> ``"so"``; runs of length 2 are left untouched.
        """
        return sub(r'(.)\1{2,}', r'\1', w)

    @staticmethod
    def merge(v) -> str:
        """Join an iterable of tokens into one string.

        Each token is followed by a single space, so a non-empty result
        always ends with a trailing space.
        """
        return "".join(w + " " for w in v)

    @staticmethod
    def convert(s: str) -> str:
        """Return a cleaned, space-separated string built from a raw sentence.

        Steps: replace URLs by ``_link_``, collapse characters repeated 3+
        times, tokenise, clean every token with :meth:`to_ascii`, then join
        the tokens back (each followed by a space).
        """
        s = clean.url(s)
        s = clean.rm_repeat(s)
        return clean.merge(clean.to_ascii(w) for w in word_tokenize(s))

    @staticmethod
    def to_ascii(w: str) -> str:
        """Lower-case a token and keep only its non-digit ASCII characters.

        Marker tokens are appended to record what was dropped:
        ``" _notascii_"`` if the token contained any non-ASCII character and
        ``" _number_"`` if it contained any digit (in that order).
        """
        onlyascii = "".join(
            c for c in w.lower() if ord(c) < 128 and not ("0" <= c <= "9")
        )
        return onlyascii + clean.notascii(w) + clean.onlynumber(w)

    @staticmethod
    def onlynumber(s: str) -> str:
        """Return ``" _number_"`` if ``s`` contains an ASCII digit, else ``""``."""
        return " _number_" if any("0" <= c <= "9" for c in s) else ""

    @staticmethod
    def notascii(s: str) -> str:
        """Return ``" _notascii_"`` if ``s`` has a non-ASCII character, else ``""``."""
        return " _notascii_" if any(ord(c) >= 128 for c in s) else ""

    #SRC -> https://www.geeksforgeeks.org/implement-isnumber-function-in-python/
    @staticmethod
    def isNumber(s) -> bool:
        """Tell whether ``s`` is a string of digits, optionally headed by signs.

        Any number of leading ``+``/``-`` characters is accepted. An empty
        string, or a string made only of sign characters, is not a number.
        """
        # Strip all leading signs at once; "".isdigit() is False, which covers
        # the empty and sign-only cases without indexing into the string.
        return s.lstrip('+-').isdigit()

    @staticmethod
    def normalize_text(s: str) -> list:
        """Return the tokens of ``convert(s)`` minus stop words and 1-char tokens.

        The text is lower-cased by :meth:`convert` before the stop-word test.
        """
        return [w for w in word_tokenize(clean.convert(s))
                if (w not in stopw and len(w) > 1)]

    @staticmethod
    def lemmatize(s: str) -> list:
        """Return the WordNet lemma of every token kept by :meth:`normalize_text`."""
        wnl = WordNetLemmatizer()
        return [wnl.lemmatize(w) for w in clean.normalize_text(s)]

    @staticmethod
    def stem(s: str) -> list:
        """Return the Porter stem of every token kept by :meth:`normalize_text`."""
        ps = PorterStemmer()
        return [ps.stem(w) for w in clean.normalize_text(s)]
    

## S140 (Sentiment140)

In [6]:
# Clean, tokenise and lemmatise every raw text; each row becomes a list of lemmas.
s140['lemma'] = s140['text'].map(clean.lemmatize)

In [7]:
# Number of lemmas kept for each row.
s140['length'] = s140['lemma'].apply(len)

In [8]:
# Re-join each list of lemmas into one space-separated string.
s140['trimmed'] = s140['lemma'].map(clean.merge)

## cc (climate change)

In [9]:
# Clean, tokenise and lemmatise every raw text; each row becomes a list of lemmas.
cc['lemma'] = cc['text'].map(clean.lemmatize)

In [10]:
# Number of lemmas kept for each row.
cc['length'] = cc['lemma'].apply(len)

In [11]:
# Re-join each list of lemmas into one space-separated string.
cc['trimmed'] = cc['lemma'].map(clean.merge)

In [15]:
# Back up the raw labels in 'old_target' only once: on a re-run the backup is
# left untouched, so it is never overwritten by the already-mapped labels.
if 'old_target' not in cc.columns:
    cc['old_target'] = cc.target

# Always rebuild 'target' from the preserved raw labels (idempotent cell), then
# map the Yes/Y and No/N labels onto 'positive' / 'negative'. Any other value
# in the column is left unchanged.
cc = cc.assign(target=cc['old_target']).replace(
    {'target': {'Yes': 'positive', 'Y': 'positive', 'No': 'negative', 'N': 'negative'}}
)

## mr (movie reviews)

In [12]:
# Clean, tokenise and lemmatise every raw text; each row becomes a list of lemmas.
mr['lemma'] = mr['text'].map(clean.lemmatize)

In [13]:
# Number of lemmas kept for each row.
mr['length'] = mr['lemma'].apply(len)

In [14]:
# Re-join each list of lemmas into one space-separated string.
mr['trimmed'] = mr['lemma'].map(clean.merge)

## Produce cleaned dataframe for future usage

In [16]:
# Persist the three DataFrames, including the columns added in the sections
# above, as pickles under data/ for reuse outside this notebook run.
s140.to_pickle('data/s140_clean_28nov.pkl')
cc.to_pickle('data/cc_clean_28nov.pkl')
mr.to_pickle('data/mr_clean_28nov.pkl')


In [4]:
# Reload the cleaned s140 frame written by the cell above.
# NOTE: only unpickle files produced by this notebook; loading an untrusted
# pickle can execute arbitrary code.
s140=pd.read_pickle('data/s140_clean_28nov.pkl')

In [103]:
# Summary statistics of the per-row lemma counts (cast to int64 for a compact
# display), a separator rule, then a preview of the first rows.
print(
    s140['length'].describe().astype('int64'),
    '='*80,
    s140.head(),
    sep='\n',
)

count    1600498
mean           8
std            4
min            0
25%            5
50%            8
75%           11
max          118
Name: length, dtype: int64
                                                text    target  \
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  negative   
1  Got a headache :/ MC stop making music, you ca...  negative   
2  lol still worked like crazy lol  . lol Your la...  negative   
3  why won't netflix send me S. Darko? I know it'...  negative   
4  [ToZ] Clan Website offline  http://www.theoutl...  negative   

                                               lemma  length  
0  [switchfoot, _link_, awww, 's, bummer, shoulda...      11  
1  [got, headache, mc, stop, making, music, ca, n...      11  
2  [lol, still, worked, like, crazy, lol, lol, la...      18  
3  [wo, n't, netflix, send, s., darko, know, 's, ...      16  
4              [toz, clan, website, offline, _link_]       5  
