# Preprocessing - Standardization

In [0]:
# Authenticate the Colab runtime with a Google account (interactive prompt).
# Cells further down copy data from and to a Google Cloud Storage bucket.
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


In [0]:
# Interactive gcloud login for the shell commands used in this notebook.
# NOTE(review): the saved output of this cell shows the pasted verification
# code and the account e-mail address — clear the output before sharing.
!gcloud auth login

Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?code_challenge=lCp_UjAg4OkTR0gHZgAg_n09K-Na6rWKxAGNaaVfndo&prompt=select_account&code_challenge_method=S256&access_type=offline&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&response_type=code&client_id=32555940559.apps.googleusercontent.com&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth


Enter verification code: 4/tQHg09_vfm6yYHM1WlCHSUm-y7xLbp4sB4adSmEF3N3e5LzHTjhVNCk
If you need to use ADC, see:
  gcloud auth application-default --help

You are now logged in as [galli.giuly@gmail.com].
Your current project is [None].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


In [0]:
# Set the gcloud `core/project` property to the project used by this notebook.
!gcloud config set project reddit-master

Updated property [core/project].


In [0]:
import pandas as pd
import string
import spacy #load spacy

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn import preprocessing
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split




In [0]:
# Copy the raw 2018 comments/posts CSV from the bucket into the current
# working directory (about 2 GiB according to the transfer log).
!gsutil cp gs://reddit_final_results/comments_posts_2018_V2.csv .

Copying gs://reddit_final_results/comments_posts_2018_V2.csv...
/ [1 files][  2.0 GiB/  2.0 GiB]   71.2 MiB/s                                   
Operation completed over 1 objects/2.0 GiB.                                      


In [0]:
# Load the downloaded CSV into memory.
# NOTE(review): the file's first column has no header, so pandas loads it as
# a regular 'Unnamed: 0' column — it looks like a previously saved row index
# (confirm); passing index_col=0 would avoid carrying it along.
comments_posts_df = pd.read_csv("comments_posts_2018_V2.csv")

In [0]:
# Inspect which columns were loaded from the CSV.
comments_posts_df.columns

Index(['Unnamed: 0', 'subreddit', 'body'], dtype='object')

Check for NaN values in the `body` column and drop the rows that contain missing values.

In [0]:
# Count the rows whose 'body' value is missing.
comments_posts_df["body"].isna().sum()

32288

In [0]:
# Drop every row that has a missing value in any column (dropna() is called
# without `subset`, so it is not restricted to 'body').
comments_posts_df = comments_posts_df.dropna()

## Transformation of the prediction target

In [0]:
# Create the 'subreddit_id' column as a plain copy of 'subreddit' for now;
# a later cell replaces its values with the integer codes from the encoder.
comments_posts_df['subreddit_id'] = comments_posts_df['subreddit']


In [0]:
# Preview the first rows: 'subreddit_id' still mirrors 'subreddit' here.
comments_posts_df.head(5)

Unnamed: 0.1,Unnamed: 0,subreddit,body,subreddit_id
0,0,aww,doubling down with the multiple sub approach g...,aww
1,1,aww,i 2nd meatball i have a seriously chunky engli...,aww
2,2,aww,my thoughts as well,aww
3,3,aww,friends don't eat friends,aww
4,4,aww,maybe the doctors are waiting with the heart i...,aww


In [0]:
# Create a label (category) encoder object that will map subreddit names
# to integer ids.
le = preprocessing.LabelEncoder()

In [0]:
# Fit the encoder on the 'subreddit' column so it learns the set of distinct
# subreddit names (exposed afterwards as le.classes_).
le.fit(comments_posts_df['subreddit'])

LabelEncoder()

In [0]:
# List the learned classes; a name's position in this list is the integer id
# it is encoded to (e.g. 'Fitness' -> 0, 'aww' -> 3).
list(le.classes_)

['Fitness',
 'IAmA',
 'atheism',
 'aww',
 'europe',
 'funny',
 'gaming',
 'movies',
 'nba',
 'politics',
 'science',
 'technology',
 'todayilearned',
 'worldnews']

In [0]:
# Apply the fitted encoder to the 'subreddit' column and store the resulting
# integer codes in 'subreddit_id' (overwriting the earlier placeholder copy).
comments_posts_df['subreddit_id'] = le.transform(comments_posts_df['subreddit']) 

In [0]:
# Confirm that 'subreddit_id' now holds the integer codes.
comments_posts_df.head(5)

Unnamed: 0.1,Unnamed: 0,subreddit,body,subreddit_id
0,0,aww,doubling down with the multiple sub approach g...,3
1,1,aww,i 2nd meatball i have a seriously chunky engli...,3
2,2,aww,my thoughts as well,3
3,3,aww,friends don't eat friends,3
4,4,aww,maybe the doctors are waiting with the heart i...,3


## Tokenization - stopwords - stemming

In [0]:
# Load a spaCy English pipeline with the parser and NER components disabled,
# and create the Snowball stemmer used by the stemming helper.
# NOTE(review): `nlp` is not referenced by any other cell shown in this
# notebook, and the "en" shortcut name is not accepted by newer spaCy
# releases (they expect a full package name such as "en_core_web_sm") —
# confirm against the installed version.
nlp = spacy.load("en", disable=['parser', 'ner'])
stemmer = SnowballStemmer(language='english')

In [0]:
# Bind spaCy's built-in English stop-word collection to the name used by the
# filtering helper, then report how many entries it has and what they are.
stop_words = STOP_WORDS
print('Number of stopwords: %d' % (len(stop_words),))
print([*stop_words])

Number of stopwords: 326
['’ve', 'nobody', 'hereafter', 'though', 'becomes', 'serious', 'their', 'else', 'somehow', 'alone', 'am', 'seeming', 'used', 'fifty', 'amount', 'using', 'we', "'d", 'not', 'did', 'same', 'besides', 'hereby', 'what', 'empty', 'among', 'mostly', 'with', 'from', 'on', 'less', 'through', 'has', 'unless', 'then', '‘s', 'front', 'seemed', 'also', 'there', 'three', 'never', 'whole', 'around', 'was', 'anyone', 'rather', 'once', 'thru', 'via', 'which', 'forty', "'ve", 'seems', 'ca', '’re', 'another', 'under', 'until', 'beyond', 'wherein', 'against', 'yet', "'ll", 'between', 'hereupon', 'call', 'whose', 'all', 'here', "'re", 'wherever', 'since', 'back', 'twelve', 'other', 'move', 'them', 'much', 'whoever', 'may', 'could', 'four', 'nevertheless', 'ours', 'out', 'already', 'see', 'you', 'below', 'go', 'me', 'towards', 'such', 'well', 'done', 'n’t', 'become', 'he', 'top', 'her', 'nor', 'one', '‘ve', 'your', 'whatever', 'along', 'first', 'within', 'six', 'noone', 'in', 'ther

In [0]:
# `parser` is the bare English language object; the tokenize helper calls it
# to split raw text into tokens.
# `stemmer` is re-created here with the same arguments as in the earlier
# cell, so this cell provides both objects the helper functions use.
parser = English()
stemmer = SnowballStemmer(language='english')

# tokenization
def tokenize(text):
    """Run `text` through the module-level `parser` and return its result.

    The returned object is iterated token by token by `remove_stopwords`,
    which reads each token's `.text` attribute.
    """
    parsed = parser(text)
    return parsed

# stopwords
def remove_stopwords(tokenized, stopwords=None):
    """Return the text of every token that is not a stop word.

    Args:
        tokenized: iterable of token objects exposing a ``.text`` attribute
            (for example the result of ``tokenize``).
        stopwords: optional collection of lower-case stop words to filter
            against. Defaults to the module-level ``stop_words`` set.

    Returns:
        list[str]: the surviving token texts, in their original order and
        with their original casing.
    """
    if stopwords is None:
        stopwords = stop_words

    # Compare case-insensitively: the stop-word entries are lower-case, so an
    # exact match would let capitalised stop words such as "The" slip through
    # (and the stemmer would then turn them back into "the").
    return [
        token.text
        for token in tokenized
        if token.text.lower() not in stopwords
    ]

# stemming
def stem(words):
    """Return a list with the stem of each item in `words`, in the same order.

    Stemming is delegated to the module-level Snowball `stemmer`.
    """
    return [stemmer.stem(word) for word in words]


def pre_process(text):
    """Turn a raw string into a list of stemmed, stop-word-free tokens.

    Pipeline: tokenize -> remove_stopwords -> stem.
    """
    tokens = tokenize(text)
    content_words = remove_stopwords(tokens)
    return stem(content_words)

def pre_process_df(df, column):
    """Replace `df[column]` in place with `pre_process` applied to each value.

    The frame passed in is modified; nothing is returned. After the call the
    column holds token lists instead of strings, so the function is meant to
    be run once per frame.
    """
    processed = df[column].map(pre_process)
    df[column] = processed
    return None

In [0]:
# Smoke test for pre_process on a short string: the three inflections of
# "try" should collapse to one stem and the stop words should be dropped.
sample = "tried try trying to tokenize our text doesn't"
pre_process(sample)

['tri', 'tri', 'tri', 'token', 'text']

In [0]:
# Try pre_process_df on 10 randomly chosen rows before running it on the
# whole frame; the helper rewrites the 'body' column of `sample` in place.
# NOTE(review): no random_state is passed, so a re-run shows different rows,
# and `sample` rebinds the name used for the test string in the previous cell.
sample = comments_posts_df.sample(10)
pre_process_df(sample,'body')
sample

Unnamed: 0.1,Unnamed: 0,subreddit,body,subreddit_id
942287,942287,aww,[dinner],3
7394974,7394974,aww,"[stride, pride]",3
7378554,7378554,nba,"[gt, easili, goat, unanim, like, gretzkythat, ...",8
10411073,10411073,science,"[test, testosteron, level, erectil, function, ...",10
8919081,8919081,atheism,"[thisit, simplest, thing, guy, staff, member, ...",2
5057181,5057181,gaming,"[hi, messag, inform, post, comment, remov, bre...",6
5570466,5570466,europe,"[nt, vaccin, kid, risk, contract, danger, dise...",4
441420,441420,Fitness,"[heard, tone, calv, exact, bigger, genet]",0
10608766,10608766,nba,"[19s, game]",8
6583306,6583306,Fitness,"[tell, burn, 5000, calori, day, addit, exercis...",0


In [0]:
# Apply pre_process to the 'body' column of the full comments_posts_df
# (in place). Long-running: the original note read "%3h", presumably meaning
# roughly 3 hours — TODO confirm.
pre_process_df(comments_posts_df,'body')

In [0]:
# Spot-check that 'body' now holds lists of stemmed tokens.
comments_posts_df.head()

Unnamed: 0.1,Unnamed: 0,subreddit,body,subreddit_id
0,0,aww,"[doubl, multipl, sub, approach, good, ya, mate]",3
1,1,aww,"[2nd, meatbal, serious, chunki, english, bulld...",3
2,2,aww,[thought],3
3,3,aww,"[friend, eat, friend]",3
4,4,aww,"[mayb, doctor, wait, heart, room]",3


In [0]:
# Print the working directory; the upload commands below use absolute paths
# under it.
!pwd

/content


In [0]:
# Save the processed frame to the working directory.
# NOTE(review): 'body' now holds Python lists, which to_csv writes in their
# string form, so a later read_csv returns strings such as "['friend', 'eat']"
# that must be parsed back into lists. The row index is written as well
# (to_csv default), which adds another unnamed column to the file.
comments_posts_df.to_csv('comments_posts_tokenized.csv')

In [0]:
# Upload the uncompressed tokenized CSV to the results bucket
# (1.7 GiB according to the transfer log).
!gsutil cp /content/comments_posts_tokenized.csv gs://reddit_final_results/

Copying file:///content/comments_posts_tokenized.csv [Content-Type=text/csv]...
/ [0 files][    0.0 B/  1.7 GiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

\
Operation completed over 1 objects/1.7 GiB.                                      


In [0]:
# Compress the tokenized CSV into a zip archive in the working directory.
!zip comments_posts_tokenized.zip comments_posts_tokenized.csv

  adding: comments_posts_tokenized.csv (deflated 71%)


In [0]:
# Upload the zip archive to the same bucket as well
# (508.6 MiB according to the transfer log).
!gsutil cp /content/comments_posts_tokenized.zip gs://reddit_final_results/

Copying file:///content/comments_posts_tokenized.zip [Content-Type=application/zip]...
/ [0 files][    0.0 B/508.6 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/
Operation completed over 1 objects/508.6 MiB.                                    
