# Cleaning
----------------

In [2]:
# Imports

import pandas as pd
import numpy as np

from nltk.tokenize import RegexpTokenizer


In [3]:
# reading in the data
diy = pd.read_csv('raw data/diy.csv')
crafts = pd.read_csv('raw data/crafts.csv')

In [4]:
# checking that they are the correct and expected shape
print(diy.shape)
print(crafts.shape)

(7500, 4)
(7500, 4)


In [5]:
diy.head(10)

Unnamed: 0,title,score,comms_num,body
0,"Badly damaged exterior wall studs, is this out...",2,0,[Pictures.](https://imgur.com/a/uafBlta) This ...
1,Need help removing a screw from the back of a ...,1,1,
2,Add second post parallel to existing post,1,0,I have an idea to mount my HD antenna on a pol...
3,rtyuiooopfghjkl,1,0,
4,DIY - recycled vinyl record clock,1,1,
5,Easy & fast friendship bracelets - Simple Frie...,1,1,
6,"How to Cleaning & Scrub, 10 baht coin,King Ram...",1,0,
7,Any idea whats going on with my basement? Rot?...,1,1,
8,Need ideas for converting rain gutter into pla...,2,2,"For reference, [photos can be found here](http..."
9,Playing with rope,1,1,


In [6]:
crafts.head(10)

Unnamed: 0,title,score,comms_num,body
0,Im trying to make octopuses catch on as a Chri...,5,0,
1,"My finace is a drummer, I made this for him ou...",7,1,
2,[VIDEO] Which Silk Thread to Use for Blackwork...,2,0,
3,Topiary of fruits and flowers: master class. T...,0,0,
4,DIY Topiary tree with artificial fruit and flo...,0,1,
5,DIY Topiary tree with artificial fruit and flo...,0,2,
6,A Pixie Scrap Cap I made my daughter. :),3,0,
7,I made a Witch hat from a thrifted jacket.,9,0,
8,DIY Jewelry using foam sheets,1,0,Hello everyone i made DIY Earrings using foam ...
9,Thin rings. Material - acrylic stone,3,0,


### NaN values:
- The **NaN** alues in the body column are from posts that linked to a photo or video
- This was expected in data collection and they need to be filled with an empty string

In [8]:
# filling all NaN values with empty strings for the body column
diy.fillna('', inplace=True)
crafts.fillna('', inplace=True)

In [257]:
crafts.info()
diy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 4 columns):
title        7500 non-null object
score        7500 non-null int64
comms_num    7500 non-null int64
body         7500 non-null object
dtypes: int64(2), object(2)
memory usage: 234.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 4 columns):
title        7500 non-null object
score        7500 non-null int64
comms_num    7500 non-null int64
body         7500 non-null object
dtypes: int64(2), object(2)
memory usage: 234.5+ KB


### Checking for blank bodies of text

In [None]:
len([1 for s in crafts['body'] if s== ''])/crafts.shape[0]

In [None]:
len([1 for s in diy['body'] if s== ''])/diy.shape[0]

### Checking all deleted and removed posts
------

In [258]:
len([1 for s in crafts['body'] if (s == '[removed]') | (s == '[deleted]')])/crafts.shape[0]

0.13386666666666666

In [259]:
len([1 for s in diy['body'] if (s == '[removed]') | (s == '[deleted]')])/diy.shape[0]

0.4

I noted that the crafts subreddit had a much larger proportion of blank bodies of text. Here we see that a similar rate of posts from DIY were removed or deleted. While deleting all of them would be justified, this would create a deeply unbalanced class.In order to maintain a fairly balanced class, I will delete all the posts from DIY but keep the deleted ones. Then I will remove both the removed and deleted posts from crafts. Then I will combine the titles with the bodies to create a text columns.

In [260]:
for i, string in enumerate(diy['body']):
    if (string == '[removed]'):
        diy.drop(i, inplace=True)
    elif (string == '[deleted]'):
        diy['body'][i] = ''

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [261]:
for i, string in enumerate(crafts['body']):
    if (string == '[removed]'):
        crafts.drop(i, inplace=True)
    elif (string == '[deleted]'):
        crafts.drop(i, inplace=True)

In [262]:
diy.reset_index(inplace=True)
diy.drop(columns='index', inplace=True)
diy


Unnamed: 0,title,score,comms_num,body
0,"Badly damaged exterior wall studs, is this out...",2,0,[Pictures.](https://imgur.com/a/uafBlta) This ...
1,Need help removing a screw from the back of a ...,1,1,
2,Add second post parallel to existing post,1,0,I have an idea to mount my HD antenna on a pol...
3,rtyuiooopfghjkl,1,0,
4,DIY - recycled vinyl record clock,1,1,
...,...,...,...,...
5576,Help- how can I fix/hide these holes in my flo...,1,1,
5577,How do fix this?,1,1,
5578,Learn cooking and making sweet Learn how to ma...,1,1,
5579,Cats scratching wood bay window platform - Cov...,1,1,


In [263]:
crafts.reset_index(inplace=True)
crafts.drop(columns='index', inplace=True)
crafts

Unnamed: 0,title,score,comms_num,body
0,Im trying to make octopuses catch on as a Chri...,5,0,
1,"My finace is a drummer, I made this for him ou...",7,1,
2,[VIDEO] Which Silk Thread to Use for Blackwork...,2,0,
3,Topiary of fruits and flowers: master class. T...,0,0,
4,DIY Topiary tree with artificial fruit and flo...,0,1,
...,...,...,...,...
6491,Swipe Acrylic Pour On Canvas Panel #25,0,0,
6492,Im trying to get my awesome husband to believe...,667,39,
6493,flower vase with glass bottle,3,0,
6494,Crochet lalylala dolls,34,3,


## Decisions:
- I'll leave in the rows with blank body text as I will only be grabbing about 20% of whatever data I gather.
- Then I'll combine the title and the body to produce a description column

In [265]:
# combining the body and title columns

diy['text'] = diy['title'] + ' ' + diy['body']

crafts['text'] = crafts['title'] + ' ' + crafts['body']

## Deep scrubbing the data:
1. Get rid of special characters or characters that will prevent matching
2. find repetative posts that are worth deleting

In [266]:
# clean the text of punctuation

tokenizer = RegexpTokenizer('\s+', gaps=True)

for i in range(diy.shape[0]):
    text = diy['text'][i]
    tokens = tokenizer.tokenize(text)
    clean_tokens = []
    for token in tokens:
        token = token.replace(",", 
                      "").replace("?", 
                        "").replace("!", 
                          "").replace("'", 
                            "").replace(".",
                              "").replace("\'t", 
                               "").lower()
        clean_tokens.append(token)

    seperator = ' '
    diy['text'][i] = seperator.join(clean_tokens)
diy.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,title,score,comms_num,body,text
0,"Badly damaged exterior wall studs, is this out...",2,0,[Pictures.](https://imgur.com/a/uafBlta) This ...,badly damaged exterior wall studs is this out ...
1,Need help removing a screw from the back of a ...,1,1,,need help removing a screw from the back of a ...
2,Add second post parallel to existing post,1,0,I have an idea to mount my HD antenna on a pol...,add second post parallel to existing post i ha...
3,rtyuiooopfghjkl,1,0,,rtyuiooopfghjkl
4,DIY - recycled vinyl record clock,1,1,,diy - recycled vinyl record clock


In [267]:
for i in range(crafts.shape[0]):
    
    text = crafts['text'][i]
    tokens = tokenizer.tokenize(text)
    clean_tokens = []
    
    for token in tokens:
        token = token.replace(",",             # removing emotionally neutral punctuation
                      "").replace(".", 
                        "").replace("'", 
                          "").replace("!",     # separating and leaving in contextual punctuation
                            " ! ").replace("?",
                              " ? ").lower()
        if not '/imgurcom/' in token:
            clean_tokens.append(token)

    seperator = ' '
    crafts['text'][i] = seperator.join(clean_tokens)
crafts.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,title,score,comms_num,body,text
0,Im trying to make octopuses catch on as a Chri...,5,0,,im trying to make octopuses catch on as a chri...
1,"My finace is a drummer, I made this for him ou...",7,1,,my finace is a drummer i made this for him out...
2,[VIDEO] Which Silk Thread to Use for Blackwork...,2,0,,[video] which silk thread to use for blackwork...
3,Topiary of fruits and flowers: master class. T...,0,0,,topiary of fruits and flowers: master class to...
4,DIY Topiary tree with artificial fruit and flo...,0,1,,diy topiary tree with artificial fruit and flo...


now that the text has been cleaned and striped of a majority of the special

In [268]:
# dropping the columns to reduce the redundancy of the columns
clean_diy = diy.drop(columns=['title', 'body'])
clean_crafts = crafts.drop(columns=['title', 'body'])

# making the target prediction variable
clean_crafts['is_craft'] = 1
clean_diy['is_craft'] = 0

combined = pd.concat((clean_diy, clean_crafts))
combined.head()

After further reflection into the data, it would be important to drop posts that were repeating as it suggests

I will also be dropping the score and comms_nums as I'd like to keep this project explicitely focused on NLP.

In [271]:
combined.drop(columns=['score', 'comms_num'], inplace=True)

In [272]:
combined.shape[0]

12077

In [273]:
duplicates = []
for post in set(combined.text):
    if list(combined['text']).count(post) > 1:
        duplicates.append(post)
len(duplicates)

323

seems that not all of the duplicates are worth deleting. I will narrow down the list manually starting with the shortest titles

In [274]:
delete = []

for dup in duplicates:
    if len(dup) < 13:
        delete.append(dup)
        
delete

['',
 'youtube',
 'home decor',
 'chester',
 ' ',
 'paper leaves',
 'god of cod',
 'diy',
 'cardboard',
 '->:@db2580',
 'gate upgrade',
 'paper flower',
 ':',
 'pdf pattern']

In [275]:
delete = ['','youtube', ':', ' ', 'diy', '->:@db2580', '    ', '         ', ': : : : :']

In [276]:
combined.shape[0]

12077

In [277]:
for text in delete:
    combined.drop(combined.loc[combined['text'] == text].index, inplace=True)

In [278]:
combined.shape[0]

11987

In [279]:
combined.reset_index(inplace=True)
combined.drop(columns='index', inplace=True)
combined

Unnamed: 0,text,is_craft
0,badly damaged exterior wall studs is this out ...,0
1,need help removing a screw from the back of a ...,0
2,add second post parallel to existing post i ha...,0
3,rtyuiooopfghjkl,0
4,diy - recycled vinyl record clock,0
...,...,...
11982,swipe acrylic pour on canvas panel #25,1
11983,im trying to get my awesome husband to believe...,1
11984,flower vase with glass bottle,1
11985,crochet lalylala dolls,1


I think this is the most clean I can have the data set. I need to check for unbalanced classes

In [280]:
combined['is_craft'].value_counts(normalize=True)

1    0.538166
0    0.461834
Name: is_craft, dtype: float64

In [281]:
# # saving the cleaned files seperately before combining

clean_crafts = combined[combined['is_craft'] == 1]
clean_diy = combined[combined['is_craft'] == 0]

clean_crafts.to_csv('cleaned files/clean_crafts.csv', index=False)
clean_diy.to_csv('cleaned files/clean_diy.csv', index=False)

In [282]:
# saving final to csv
combined.to_csv('cleaned files/combined.csv')