In [2]:
import praw
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
from pprint import pprint
%matplotlib inline
from sklearn.preprocessing import StandardScaler
import emoji 
import re
from emoji import emojize
from emoji import *
import seaborn as sns

## Import data 

In [3]:
# Load the exploratory comment dataset and discard the 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv call -- confirm).
sw = pd.read_csv('../Datasets_capstone/exploratory_datasets/eg2.csv').drop(columns='Unnamed: 0')

In [4]:
# Display the column labels of the freshly loaded frame.
sw.columns

Index(['comment_score', 'post_title', 'parent_created', 'num_parent_comments',
       'post_id', 'parent_comment', 'comment_body', 'comment_depth',
       'comment_edited', 'awards', 'controversial', 'comment_created',
       'comment_archived', 'comment_gilded', 'replies_to_comment'],
      dtype='object')

## General cleaning

In [5]:
# The 'parent_comment' column is used as the comment identifier from here on.
sw = sw.rename(columns={'parent_comment': 'comment_id'})

In [9]:
# Discard every row that has a missing value in any column.
sw = sw.dropna()

In [10]:
# Replace newline characters in the comment text with a single space.
sw['comment_body'] = sw['comment_body'].replace('\n', ' ', regex=True)

In [11]:
# Same newline clean-up as for comment_body, applied to the replies text.
sw['replies_to_comment'] = sw['replies_to_comment'].replace('\n', ' ', regex=True)

In [12]:
# Display (rows, columns) after the cleaning steps above.
sw.shape

(72803, 15)

## Feature Engineering Functions

### Returning:
- Year, day of the week and hour at which each comment was made
- Binary flag for whether the comment was edited
- Length of the comment (number of words)
- Special characters
- Ratio of uppercase letters to all letters
- Emojis as text (e.g. smiley face)
- Emojis
- Binary flag for whether the comment contains an emoji
- URLs
- Time between the comment and the original post

In [13]:
def date(cell):
    """Split a Unix timestamp into its year, weekday name and hour.

    Parameters
    ----------
    cell : number
        Timestamp expressed in seconds (it is parsed with ``unit='s'``).

    Returns
    -------
    tuple
        ``(year, day_name, hour)`` of the parsed timestamp.
    """
    stamp = pd.to_datetime(cell, unit='s')
    year, weekday, hour = stamp.year, stamp.day_name(), stamp.hour
    return year, weekday, hour

In [14]:
def comment_edited_date(cell):
    """Split an "edited" timestamp into year, weekday name and hour.

    Parameters
    ----------
    cell : object
        Value of the edit field. It is parsed as a Unix timestamp in seconds;
        values that cannot be parsed are treated as "not edited".

    Returns
    -------
    tuple or str
        ``(year, day_name, hour)`` when ``cell`` parses as a timestamp,
        otherwise the string ``'False'``.
    """
    try:
        stamp = pd.to_datetime(cell, unit='s')
        return stamp.year, stamp.day_name(), stamp.hour
    except Exception:
        # Best-effort fallback kept from the original, but narrowed from a
        # bare ``except`` so KeyboardInterrupt/SystemExit are not swallowed.
        return 'False'

In [15]:
def title_l(cell):
    """Return the number of space-separated pieces in ``cell``.

    The text is split on single spaces, so consecutive spaces produce empty
    pieces that are counted too, and an empty string counts as one piece.
    """
    pieces = cell.split(' ')
    return len(pieces)

In [16]:
def special_char(cell):
    """List the "special" characters found in ``cell``.

    A character is reported unless it is a word character (``\\w``), a space,
    a hyphen, or one of ``( ) _ ; ’ “ ' , . [ ]``.

    Parameters
    ----------
    cell : str
        Text to scan.

    Returns
    -------
    list of str
        Every matching character, in order of appearance (duplicates kept).
    """
    # Raw string: the pattern is unchanged, but the backslash escapes are no
    # longer invalid string escapes that trigger warnings on recent Pythons.
    return re.findall(r"[^()_;’“'\,\.\[\]\w -]", cell)

In [17]:
def uppercase_ratio(cell):
    """Return the share of ASCII letters in ``cell`` that are uppercase.

    Parameters
    ----------
    cell : str
        Text to inspect.

    Returns
    -------
    float or None
        ``uppercase letters / all letters`` counting only ``A-Z``/``a-z``.
        ``None`` when ``cell`` contains no such letters or is not text, which
        is the value the original produced by silently swallowing the error.
    """
    try:
        letters = re.findall('[A-Za-z]', cell)
    except TypeError:
        # Non-text input (e.g. NaN or None): keep the best-effort None result.
        return None
    if not letters:
        # No letters at all: the ratio is undefined rather than zero.
        return None
    capitals = sum(1 for letter in letters if letter.isupper())
    return capitals / len(letters)

In [18]:
def extract_text_em(cell):
    """Return the emoji characters of ``cell`` converted to text by ``demojize``.

    Only single characters that are members of ``UNICODE_EMOJI`` are kept;
    they are concatenated and passed to ``demojize``.

    NOTE(review): ``UNICODE_EMOJI`` and ``demojize`` presumably come from the
    ``from emoji import *`` star-import, so this depends on the installed
    ``emoji`` version exposing those names -- confirm.
    """
    emoji_chars = [char for char in cell if char in UNICODE_EMOJI]
    return demojize(''.join(emoji_chars))

In [19]:
def extract_emojis(cell):
    """Return a string made of the emoji characters found in ``cell``.

    Each character of ``cell`` is kept only if it is a member of
    ``UNICODE_EMOJI``; the result is an empty string when none match.

    NOTE(review): ``UNICODE_EMOJI`` presumably comes from the
    ``from emoji import *`` star-import -- confirm it exists in the installed
    ``emoji`` version.
    """
    kept = [char for char in cell if char in UNICODE_EMOJI]
    return ''.join(kept)

In [20]:
def convert_empty(cell):
    """Return ``cell`` unchanged when it is truthy, else the label ``'no_emoji'``."""
    return cell or 'no_emoji'

In [21]:
def binarise_em(cell):
    """Return 1 when ``cell`` is truthy (e.g. a non-empty string), otherwise 0."""
    return int(bool(cell))

In [22]:
def URLS(cell):
    """Find ``www.<name>.com/<path>`` style links in ``cell``.

    Only the narrow form ``www.`` + letters + ``.com/`` + letters is matched;
    other schemes, domains or path characters are not captured.

    Parameters
    ----------
    cell : str
        Text to scan.

    Returns
    -------
    list of str
        Every matched link, in order of appearance (empty list if none).
    """
    # The dots are escaped so they match a literal '.' only; unescaped they
    # matched any character and produced false positives.
    return re.findall(r'(www\.[a-zA-Z]*\.com/[a-zA-Z]*)', cell)

In [23]:
def comment_l(cell):
    """Return the length of a comment in space-separated pieces.

    The placeholder texts ``'[removed]'`` and ``'[deleted]'`` count as length
    0; any other text is split on single spaces and the pieces are counted.
    """
    if cell in ('[removed]', '[deleted]'):
        return 0
    pieces = cell.split(' ')
    return len(pieces)

In [24]:
def time_diff(parent, comment):
    """Return the time elapsed between ``parent`` and ``comment``.

    Both arguments are parsed as Unix timestamps in seconds; the result is
    ``comment - parent``, so it is negative when the comment is earlier.
    """
    started = pd.to_datetime(parent, unit='s')
    ended = pd.to_datetime(comment, unit='s')
    elapsed = ended - started
    return elapsed

## New columns

In [25]:
# Share of uppercase letters in each comment body.
sw['upper_rcomment'] = sw['comment_body'].apply(uppercase_ratio)

In [26]:
# Display the column labels again to confirm 'upper_rcomment' was added.
sw.columns

Index(['comment_score', 'post_title', 'parent_created', 'num_parent_comments',
       'post_id', 'comment_id', 'comment_body', 'comment_depth',
       'comment_edited', 'awards', 'controversial', 'comment_created',
       'comment_archived', 'comment_gilded', 'replies_to_comment',
       'upper_rcomment'],
      dtype='object')

In [27]:
# (year, weekday name, hour) tuple derived from the parent's creation timestamp.
sw['day_time_parent'] = sw['parent_created'].apply(date)

In [28]:
# (year, weekday name, hour) tuple derived from the comment's creation timestamp.
sw['day_time_comment'] = sw['comment_created'].apply(date)

In [29]:
# Length of each comment body as computed by comment_l.
sw['comment_length'] = sw['comment_body'].apply(comment_l)

In [30]:
# List of special characters found in each comment body.
sw['spec_char'] = sw['comment_body'].apply(special_char)

In [31]:
# Emoji content of each comment: as text, and as the raw emoji characters.
# 'binary_emoji' holds the raw characters here; the next cell turns it into 0/1.
sw['text_emoji'] = sw['comment_body'].apply(extract_text_em)
sw['binary_emoji'] = sw['comment_body'].apply(extract_emojis)

In [32]:
# Label comments without emojis and collapse the emoji string into a 0/1 flag.
sw['text_emoji'] = sw['text_emoji'].apply(convert_empty)
sw['binary_emoji'] = sw['binary_emoji'].apply(binarise_em)

In [33]:
# Elapsed time from the parent's creation to the comment's creation,
# computed on the whole columns at once.
sw['time_diff'] = time_diff(parent=sw['parent_created'], comment=sw['comment_created'])

In [34]:
# List of links found in each comment body.
sw['comment_URL'] = sw['comment_body'].apply(URLS)

In [35]:
# Cast the replies to str so the regex-based URL extraction can be applied.
sw['replies_to_comment'] = sw['replies_to_comment'].astype(str)

In [36]:
# List of links found in the replies to each comment.
sw['reply_URL'] = sw['replies_to_comment'].apply(URLS)

In [37]:
# Label comments without links as 'No_URL'.
# URLS returns a list, so an empty result is detected by emptiness; the
# original compared against the string '[]', which a list never equals.
# The string form is still accepted in case the column was stringified.
sw['comment_URL'] = sw['comment_URL'].apply(
    lambda urls: 'No_URL' if (not urls or urls == '[]') else urls
)

In [38]:
# Label replies without links as 'No_URL'.
# URLS returns a list, so an empty result is detected by emptiness; the
# original compared against the string '[]', which a list never equals.
# The string form is still accepted in case the column was stringified.
sw['reply_URL'] = sw['reply_URL'].apply(
    lambda urls: 'No_URL' if (not urls or urls == '[]') else urls
)

In [39]:
# Unpack the (year, weekday name, hour) tuples of the parent into three columns.
sw['year_parent'] = sw['day_time_parent'].apply(lambda parts: parts[0])
sw['day_of_week_parent'] = sw['day_time_parent'].apply(lambda parts: parts[1])
sw['time_parent'] = sw['day_time_parent'].apply(lambda parts: parts[2])

In [40]:
# Unpack the (year, weekday name, hour) tuples of the comment into three columns.
sw['year_comment'] = sw['day_time_comment'].apply(lambda parts: parts[0])
sw['day_of_week_comment'] = sw['day_time_comment'].apply(lambda parts: parts[1])
sw['time_comment'] = sw['day_time_comment'].apply(lambda parts: parts[2])

In [41]:
# Display (rows, columns) after all engineered features were added.
sw.shape

(72803, 31)

# Save as csv 

In [42]:
# Write the engineered frame to 'sw_cleaned2.csv' in the working directory.
# NOTE(review): DataFrame.to_csv returns None when given a path, so
# `sw_cleaned` ends up as None rather than a DataFrame.
# NOTE(review): the index is written as well (no index=False), so reloading
# the file yields an extra unnamed first column -- confirm that is intended.
sw_cleaned = sw.to_csv('sw_cleaned2.csv')