import pandas as pd
import numpy as np

# Load the train and test CSV files; the "id" column becomes the row index.
train, test = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in ("./dreaddit-train.csv", "./dreaddit-test.csv")
)

# Notebook preview of the first training rows.
train.head()

In [146]:
# Show (rows, columns) for the train split, then the test split, one per line.
print(train.shape, test.shape, sep="\n")

(2838, 115)
(715, 115)


In [147]:
# Split the training column names by dtype:
# object columns are treated as categorical, int64/float64 as numerical.

categorical_col = [
    name for name, kind in train.dtypes.items() if kind == 'object'
]
numerical_col = [
    name for name, kind in train.dtypes.items()
    if kind in ['int64', 'float64']
]

print(categorical_col)

print(numerical_col)


['subreddit', 'post_id', 'sentence_range', 'text']
['label', 'confidence', 'social_timestamp', 'social_karma', 'syntax_ari', 'lex_liwc_WC', 'lex_liwc_Analytic', 'lex_liwc_Clout', 'lex_liwc_Authentic', 'lex_liwc_Tone', 'lex_liwc_WPS', 'lex_liwc_Sixltr', 'lex_liwc_Dic', 'lex_liwc_function', 'lex_liwc_pronoun', 'lex_liwc_ppron', 'lex_liwc_i', 'lex_liwc_we', 'lex_liwc_you', 'lex_liwc_shehe', 'lex_liwc_they', 'lex_liwc_ipron', 'lex_liwc_article', 'lex_liwc_prep', 'lex_liwc_auxverb', 'lex_liwc_adverb', 'lex_liwc_conj', 'lex_liwc_negate', 'lex_liwc_verb', 'lex_liwc_adj', 'lex_liwc_compare', 'lex_liwc_interrog', 'lex_liwc_number', 'lex_liwc_quant', 'lex_liwc_affect', 'lex_liwc_posemo', 'lex_liwc_negemo', 'lex_liwc_anx', 'lex_liwc_anger', 'lex_liwc_sad', 'lex_liwc_social', 'lex_liwc_family', 'lex_liwc_friend', 'lex_liwc_female', 'lex_liwc_male', 'lex_liwc_cogproc', 'lex_liwc_insight', 'lex_liwc_cause', 'lex_liwc_discrep', 'lex_liwc_tentat', 'lex_liwc_certain', 'lex_liwc_differ', 'lex_liwc_perce

In [148]:
#EDA

# Using pandas-profiling here; other tools such as D-Tale or Sweetviz can also be used.
# import pandas_profiling as pp
# profile = pp.ProfileReport(train, minimal=True)
# profile.to_file(output_file="report.html")

In [None]:
# Number of training rows per subreddit.
train['subreddit'].value_counts()

In [150]:
# post_id and sentence_range are dropped from both splits because they are
# judged not useful as features.

train = train.drop(columns=['post_id', 'sentence_range'])
test = test.drop(columns=['post_id', 'sentence_range'])

In [151]:
# Working copies of each split plus the target column.
# The copies still contain the `label` column at this point.
X = train.copy()
y = train['label']

X_test = test.copy()
y_test = test['label']


In [152]:
# One-hot encode the `subreddit` text column (its cardinality is 10).
# The encoder is fitted on the training split only and then applied to the
# test split; categories unseen during fitting are encoded as all zeros.

from sklearn.preprocessing import OneHotEncoder
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only know the `sparse` keyword.
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train['subreddit'].to_frame()))
OH_cols_test = pd.DataFrame(OH_encoder.transform(test['subreddit'].to_frame()))

# The encoder returns plain arrays, so restore the original row index to make
# the column-wise concat below align row by row.
OH_cols_train.index = train.index
OH_cols_test.index = test.index

num_X_train = train.drop(['subreddit'], axis=1)
num_X_test = test.drop(['subreddit'], axis=1)

# NOTE(review): `label` and `confidence` are still columns of train/test here,
# so they end up inside X / X_test -- confirm they are removed before any
# model is fitted on these frames.
# NOTE(review): the one-hot columns keep their default integer names (0..n-1)
# next to string-named columns; rename them if a consumer needs all-string names.
X = pd.concat([num_X_train, OH_cols_train], axis=1)
X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

In [153]:
# text preprocessing helper functions:
# - lowercase the text
# - remove hyperlinks
# - remove punctuation
# - remove words containing numbers
# - tokenize
# NOTE: stopwords are NOT removed by the helpers below


import nltk,re,string
def clean_text(text):
    """Normalise a raw text string before tokenisation.

    The steps are applied in this order: lowercase the text, remove
    ``[...]`` spans, remove ``http(s)://`` and ``www.`` links, remove
    ``<...>`` markup, remove ASCII punctuation, turn newlines into spaces,
    and remove every word that contains a digit.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Runs of whitespace left behind by the removals
        are not collapsed here.
    """
    # Raw-string patterns avoid invalid-escape warnings for \[, \S, \w, \d.
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A newline separates words: replace it with a space instead of deleting
    # it, otherwise "end\n2019" fuses into "end2019" and the digit-word rule
    # below would drop the whole fused token.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def text_preprocessing(text):
    """
    Clean the text, split it into word tokens and re-join them.

    The input is passed through ``clean_text``, tokenised on runs of word
    characters, and the tokens are joined back with single spaces.
    """
    cleaned = clean_text(text)
    word_tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(cleaned)
    return ' '.join(word_tokens)

In [154]:
# Build the `text_clean` column on both splits: cast every entry to str, then
# run it through text_preprocessing.
X['text_clean'] = X['text'].apply(str).map(text_preprocessing)
X_test['text_clean'] = X_test['text'].apply(str).map(text_preprocessing)

In [155]:
# Character length of the cleaned text, for both splits.
X['text_len'] = X['text_clean'].astype(str).map(len)
X_test['text_len'] = X_test['text_clean'].astype(str).map(len)

# Whitespace-separated word count of the cleaned text, for both splits.
X['text_word_count'] = X['text_clean'].map(lambda cleaned: len(str(cleaned).split()))
X_test['text_word_count'] = X_test['text_clean'].map(lambda cleaned: len(str(cleaned).split()))

In [156]:
# Longest cleaned training text, measured in characters.
print(X.text_len.max())

1599


In [None]:
# for tokenizing
import transformers

#creating a function
def func_tokenizer(tokenizer_name, docs, max_len=1610):
    """Encode documents into fixed-width id, mask and segment arrays.

    Each document is encoded with ``tokenizer_name.encode_plus`` (special
    tokens added, truncated to ``max_len``) and then right-padded to
    ``max_len`` entries.

    Args:
        tokenizer_name: Tokenizer object exposing ``encode_plus`` and
            ``pad_token_id``.
        docs: Iterable of text strings to encode.
        max_len: Width of every output row.
            NOTE(review): the default of 1610 exceeds the 512-position limit
            commonly associated with BERT-base checkpoints -- confirm that
            downstream consumers can handle rows this long.

    Returns:
        Tuple ``(ids, masks, segments)`` of NumPy arrays with one row per
        document and ``max_len`` columns: token ids padded with
        ``pad_token_id``, an attention mask (1 for real tokens, 0 for
        padding), and segment ids padded with 0. For empty ``docs`` all
        three arrays have shape ``(0, max_len)``.
    """
    # The padding id is loop-invariant, so look it up once.
    padding_id = tokenizer_name.pad_token_id

    ids = []
    masks = []
    segments = []
    for doc in docs:
        tokens = tokenizer_name.encode_plus(doc, None,add_special_tokens=True,max_length=max_len, truncation_strategy='longest_first',truncation=True)
        token_ids = tokens["input_ids"]
        n_tokens = len(token_ids)
        pad_len = max_len - n_tokens

        # Not every tokenizer emits segment ids; fall back to all zeros
        # instead of failing with a KeyError.
        if "token_type_ids" in tokens:
            seg_ids = list(tokens["token_type_ids"])
        else:
            seg_ids = [0] * n_tokens

        ids.append(token_ids + [padding_id] * pad_len)
        masks.append([1] * n_tokens + [0] * pad_len)
        segments.append(seg_ids + [0] * pad_len)

    if not ids:
        # Keep the result two-dimensional even when there is nothing to encode.
        return tuple(np.empty((0, max_len), dtype=int) for _ in range(3))

    return np.array(ids), np.array(masks), np.array(segments)

In [158]:
def text_to_tokenize_bert(X):
    """Replace the text columns of ``X`` with BERT token features.

    The ``text_clean`` column is encoded with the Hugging Face
    ``bert-base-uncased`` tokenizer via ``func_tokenizer``. The resulting
    ids, masks and segment ids become ``Tokenized_ID_<n>``,
    ``Tokenized_Mask_<n>`` and ``Tokenized_Segment_<n>`` columns (numbered
    from 1). The ``text`` and ``text_clean`` columns are dropped.

    Returns a new DataFrame; the frame passed in is not modified.
    """
    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
    ids, masks, segments = func_tokenizer(tokenizer, X['text_clean'])

    # Every encoded row has the same width, so the first row gives the count.
    width = len(ids[0])

    def _as_frame(values, name_template):
        # Wrap one encoded array as a DataFrame aligned to X's row index.
        labels = [name_template % (pos + 1) for pos in range(width)]
        return pd.DataFrame(values, index=X.index, columns=labels)

    bert_dfs = pd.concat(
        [
            _as_frame(ids, "Tokenized_ID_%d"),
            _as_frame(masks, "Tokenized_Mask_%d"),
            _as_frame(segments, "Tokenized_Segment_%d"),
        ],
        axis=1,
    )
    without_text = X.drop(['text', 'text_clean'], axis=1)
    return pd.concat([without_text, bert_dfs], axis=1)
    

In [159]:
# Swap the text columns of each split for BERT token features.
# Both names are rebound in place, so re-running this cell on the already
# converted frames will fail: the 'text_clean' column no longer exists.
X=text_to_tokenize_bert(X)
X_test=text_to_tokenize_bert(X_test)


1610
1610
1610
1610
1610
1610


In [160]:
# Preview the first rows of the pre-processed train and test frames.
print(X.head(), X_test.head(), sep="\n")

       label  confidence  social_timestamp  social_karma  syntax_ari  \
id                                                                     
33181      1         0.8        1521614353             5    1.806818   
2606       0         1.0        1527009817             4    9.429737   
38816      1         0.8        1535935605             2    7.769821   
239        1         0.6        1516429555             0    2.667798   
1421       1         0.8        1539809005            24    7.554238   

       lex_liwc_WC  lex_liwc_Analytic  lex_liwc_Clout  lex_liwc_Authentic  \
id                                                                          
33181          116              72.64           15.04               89.26   
2606           109              79.08           76.85               56.75   
38816          167              33.80           76.38               86.24   
239            273               2.98           15.25               95.42   
1421            89              3

In [161]:
# Persist both pre-processed frames as CSV files in the working directory
# (the row index is written as the first column).

X.to_csv(path_or_buf="pre_processed_train.csv")
X_test.to_csv(path_or_buf="pre_processed_test.csv")