# Data preprocessing

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from scipy.special import logit, expit
import string
import re    #for regex
import nltk
from nltk.data import load
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer  

In [2]:
# Make sure the NLTK data packages (punkt tokenizer models, stopword lists,
# WordNet) are available locally. The value displayed for this cell is the
# return value of the last download call.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\evgen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\evgen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\evgen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Read the raw train and test splits (in that order); every missing value is
# replaced by a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('../data/raw/train.csv', '../data/raw/test.csv')
)

## Feature extraction

In [4]:
def feature_extraction(data):
    """Add simple text-statistics columns derived from ``data["comment_text"]``.

    The frame is modified in place and also returned, so both
    ``feature_extraction(df)`` and ``df = feature_extraction(df)`` work.

    Columns added, in this order:
        count_sent           number of newline characters + 1
        count_word           number of whitespace-separated tokens
        count_unique_word    number of distinct tokens
        count_letters        total number of characters
        count_punctuations   characters contained in ``string.punctuation``
        count_words_upper    tokens for which ``str.isupper()`` is true
        count_words_title    tokens for which ``str.istitle()`` is true
        count_stopwords      lower-cased tokens found in ``stopwords.words("english")``
        mean_word_len        mean token length
        word_unique_percent  count_unique_word as a percentage of count_word
        punct_percent        count_punctuations as a percentage of count_word
        uppercase_percent    count_words_upper as a percentage of count_word
        stopwords_perent     count_stopwords as a percentage of count_word

    Comments without any token (empty or whitespace only) get NaN in
    ``mean_word_len`` and in the four percentage columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``comment_text`` column; each value is passed through
        ``str()`` before being analysed.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data`` with the columns above added.
    """
    # Convert and tokenise once instead of once per feature.
    comments = data["comment_text"].apply(str)
    tokens = comments.apply(lambda text: text.split())

    data['count_sent'] = comments.apply(lambda text: text.count("\n") + 1)
    # Word count in each comment
    data['count_word'] = tokens.apply(len)
    # Unique word count
    data['count_unique_word'] = tokens.apply(lambda words: len(set(words)))
    # Letter count
    data['count_letters'] = comments.apply(len)
    # Punctuation count. This must be computed from `data`: the previous
    # version read the global `train` frame, so any other frame (e.g. test)
    # received train's counts aligned by index.
    punctuation = set(string.punctuation)
    data["count_punctuations"] = comments.apply(
        lambda text: sum(ch in punctuation for ch in text)
    )
    # Upper case words count
    data["count_words_upper"] = tokens.apply(
        lambda words: sum(w.isupper() for w in words)
    )
    # Title case words count
    data["count_words_title"] = tokens.apply(
        lambda words: sum(w.istitle() for w in words)
    )
    # Number of stopwords (tokens are lower-cased before the lookup)
    eng_stopwords = set(stopwords.words("english"))
    data["count_stopwords"] = comments.apply(
        lambda text: sum(w in eng_stopwords for w in text.lower().split())
    )
    # Average length of the words; NaN is returned explicitly for token-less
    # comments so numpy does not warn about the mean of an empty list.
    data["mean_word_len"] = tokens.apply(
        lambda words: np.mean([len(w) for w in words]) if words else np.nan
    )
    # Derived features
    data['word_unique_percent'] = data['count_unique_word'] * 100 / data['count_word']
    data['punct_percent'] = data['count_punctuations'] * 100 / data['count_word']
    data['uppercase_percent'] = 100 * data["count_words_upper"] / data['count_word']
    # The column name keeps its historical spelling so existing consumers of
    # the output keep working.
    data['stopwords_perent'] = 100 * data["count_stopwords"] / data['count_word']
    return data

In [5]:
# Enrich both splits with the hand-crafted text statistics (train first, then test).
train, test = map(feature_extraction, (train, test))

Save the resulting datasets

In [32]:
# Persist both frames as CSV under ../data/preprocessed/.
# NOTE(review): index=False is not passed, so pandas also writes the row index
# as an unnamed first column -- confirm downstream readers expect that, and
# that the target directory already exists.
train.to_csv('../data/preprocessed/train.csv')
test.to_csv('../data/preprocessed/test.csv')

## Cleaning

In [9]:
# NOTE(review): this cell was originally flagged as raising an error. The
# version below no longer loads the punkt pickle and tolerates a missing APPO
# mapping -- TODO confirm it now runs in your environment.
def clean(comment):
    """Normalise one comment and return the cleaned text as a single string.

    Steps, in order:
      1. lower-case the text, so that "Hi" and "hi" are the same;
      2. delete newline and carriage-return characters;
      3. delete IPv4-looking number groups and ``[[...]`` user markup;
      4. split the text into word tokens with ``TweetTokenizer``;
      5. replace tokens found in the ``APPO`` apostrophe dictionary
         (e.g. "you're" -> "you are") when such a dictionary is defined;
      6. lemmatise every token as a verb with ``WordNetLemmatizer``;
      7. drop tokens contained in ``stopwords.words("english")``.

    Earlier revisions computed steps 4-7 and then returned the text from
    step 3, discarding the result; the processed tokens are now returned.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Convert to lower case
    comment = comment.lower()
    # Raw strings keep the patterns identical while avoiding the
    # invalid-escape-sequence warnings of "\d" / "\[" in normal literals.
    # remove \n
    comment = re.sub(r"\n", "", comment)
    # remove \r
    comment = re.sub(r"\r", "", comment)
    # remove leaky elements like ip
    comment = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "", comment)
    # removing usernames
    # NOTE(review): `.*` is greedy, so everything from the first "[[" up to
    # the LAST "]" in the comment is removed -- confirm this is intended.
    comment = re.sub(r"\[\[.*\]", "", comment)

    # Split the text into words. The punkt pickle used previously is a
    # sentence splitter, so the word-level steps below operated on whole
    # sentences; TweetTokenizer yields word tokens and needs no data download.
    words = TweetTokenizer().tokenize(comment)

    # (') apostrophe replacement, e.g. you're --> you are.
    # APPO is expected to be defined in another cell; when it is not, the
    # tokens are left unchanged instead of raising NameError.
    contractions = globals().get("APPO", {})
    words = [contractions.get(word, word) for word in words]

    lem = WordNetLemmatizer()
    words = [lem.lemmatize(word, "v") for word in words]

    eng_stopwords = set(stopwords.words("english"))
    words = [w for w in words if w not in eng_stopwords]

    # NOTE(review): the tokenizer, lemmatizer and stopword set are rebuilt on
    # every call, as before; consider hoisting them if this is applied to a
    # whole column.
    return " ".join(words)