# Text Data Cleaning

In [2]:
# load the packages
import json  
import re
from collections import Counter
import copy
import string
import time
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize  
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from textblob import TextBlob
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('english'))
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
from nltk.tag import StanfordNERTagger
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

## Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

from sklearn.metrics import classification_report

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

SEED = 42

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiaying/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jiaying/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jiaying/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jiaying/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Using TensorFlow backend.


In [3]:
text = pd.read_csv('text.csv', lineterminator='\n')

In [4]:
text.head()

Unnamed: 0,title,channel_title,tags,description,category,country,views,popular
0,Eminem - Walk On Water (Audio) ft. Beyonc√©,EminemVEVO,"Eminem|""Walk""|""On""|""Water""|""Aftermath/Shady/In...",Eminem's new track Walk on Water ft. Beyonc√© i...,Music,CA,17158579,1
1,PLUSH - Bad Unboxing Fan Mail,iDubbbzTV,"plush|""bad unboxing""|""unboxing""|""fan mail""|""id...",STill got a lot of packages. Probably will las...,Comedy,CA,1014651,0
2,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",WATCH MY PREVIOUS VIDEO ‚ñ∂ \n\nSUBSCRIBE ‚ñ∫ http...,Comedy,CA,3191434,1
3,I Dare You: GOING BALD!?,nigahiga,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",I know it's been a while since we did this sho...,Entertainment,CA,2095828,1
4,Ed Sheeran - Perfect (Official Music Video),Ed Sheeran,"edsheeran|""ed sheeran""|""acoustic""|""live""|""cove...",üéß: https://ad.gt/yt-perfect\nüí∞: https://atlant...,Music,CA,33523622,1


<b> Description

In [5]:
text.description[:50]

0     Eminem's new track Walk on Water ft. Beyonc√© i...
1     STill got a lot of packages. Probably will las...
2     WATCH MY PREVIOUS VIDEO ‚ñ∂ \n\nSUBSCRIBE ‚ñ∫ http...
3     I know it's been a while since we did this sho...
4     üéß: https://ad.gt/yt-perfect\nüí∞: https://atlant...
5     ‚ñ∫ Follow for News! - https://twitter.com/KEEMS...
6     Vanoss Merch Shop: https://vanoss.3blackdot.co...
7     SHANTELL'S CHANNEL - https://www.youtube.com/s...
8     Join the movement. Be a Maverick ‚ñ∫ https://Sho...
9     Sheldon is roasting pastor of the church\nyoun...
10    Watch the official music video of Bank Account...
11    Subscribe Here: http://bit.ly/2uaz0on\n12 Hot ...
12    Thanks for watching the drama! Help more peopl...
13    Song - Daang\nSinger - Mankirt Aulakh\nFaceboo...
14    CLICK TO SUBSCRIBE TO THE YOUTUBERS IN THIS EP...
15    In the Outsmarted finale, Mike trains with an ...
16    3 Days left to cop NELK merch: https://nelk.ca...
17    I think Sarah Millican was 

In [6]:
text.description.isnull().sum()

2478

In [7]:
text.description.fillna('', inplace = True)

In [8]:
# remove backslash + 1 character
desc_1 = text.description.str.replace('\\\\.', '', flags=re.MULTILINE)

In [9]:
desc_1[:50]

0     Eminem's new track Walk on Water ft. Beyonc√© i...
1     STill got a lot of packages. Probably will las...
2     WATCH MY PREVIOUS VIDEO ‚ñ∂ SUBSCRIBE ‚ñ∫ https://...
3     I know it's been a while since we did this sho...
4     üéß: https://ad.gt/yt-perfectüí∞: https://atlanti....
5     ‚ñ∫ Follow for News! - https://twitter.com/KEEMS...
6     Vanoss Merch Shop: https://vanoss.3blackdot.co...
7     SHANTELL'S CHANNEL - https://www.youtube.com/s...
8     Join the movement. Be a Maverick ‚ñ∫ https://Sho...
9     Sheldon is roasting pastor of the churchyoung ...
10    Watch the official music video of Bank Account...
11    Subscribe Here: http://bit.ly/2uaz0on12 Hot Gl...
12    Thanks for watching the drama! Help more peopl...
13    Song - DaangSinger - Mankirt AulakhFacebook - ...
14    CLICK TO SUBSCRIBE TO THE YOUTUBERS IN THIS EP...
15    In the Outsmarted finale, Mike trains with an ...
16    3 Days left to cop NELK merch: https://nelk.ca...
17    I think Sarah Millican was 

In [10]:
# remove urls
desc_2 = desc_1.str.replace('http\S+', '', flags=re.MULTILINE)

In [11]:
desc_2[:50]

0     Eminem's new track Walk on Water ft. Beyonc√© i...
1     STill got a lot of packages. Probably will las...
2     WATCH MY PREVIOUS VIDEO ‚ñ∂ SUBSCRIBE ‚ñ∫  FOR WAT...
3     I know it's been a while since we did this sho...
4     üéß:   to Ed's channel:  Ed on...Facebook:    We...
5     ‚ñ∫ Follow for News! -  Also follow #DramaAlert ...
6     Vanoss Merch Shop:  by: Evan Fong  by: Jack Wa...
7     SHANTELL'S CHANNEL -  -  this video in 4k on t...
8     Join the movement. Be a Maverick ‚ñ∫  ONE CAN ST...
9     Sheldon is roasting pastor of the churchyoung ...
10    Watch the official music video of Bank Account...
11    Subscribe Here:  Hot Glue Gun Life Hacks For C...
12    Thanks for watching the drama! Help more peopl...
13    Song - DaangSinger - Mankirt AulakhFacebook - ...
14    CLICK TO SUBSCRIBE TO THE YOUTUBERS IN THIS EP...
15    In the Outsmarted finale, Mike trains with an ...
16    3 Days left to cop NELK merch:  us on Instagra...
17    I think Sarah Millican was ver

In [12]:
# only keep word characters and whitespaces
desc_3 = desc_2.str.replace('[^\w\s]', '', flags=re.UNICODE)

In [13]:
desc_3[:50]

0     Eminems new track Walk on Water ft Beyonc√© is ...
1     STill got a lot of packages Probably will last...
2     WATCH MY PREVIOUS VIDEO  SUBSCRIBE   FOR WATCH...
3     I know its been a while since we did this show...
4        to Eds channel  Ed onFacebook    Website  J...
5      Follow for News   Also follow DramaAlert on I...
6     Vanoss Merch Shop  by Evan Fong  by Jack Wagne...
7     SHANTELLS CHANNEL     this video in 4k on this...
8     Join the movement Be a Maverick   ONE CAN STOP...
9     Sheldon is roasting pastor of the churchyoung ...
10    Watch the official music video of Bank Account...
11    Subscribe Here  Hot Glue Gun Life Hacks For Cr...
12    Thanks for watching the drama Help more people...
13    Song  DaangSinger  Mankirt AulakhFacebook     ...
14    CLICK TO SUBSCRIBE TO THE YOUTUBERS IN THIS EP...
15    In the Outsmarted finale Mike trains with an M...
16    3 Days left to cop NELK merch  us on Instagram...
17    I think Sarah Millican was very excited f

In [14]:
# remove numbers
desc_4 = desc_3.str.replace('\d+', '')

In [15]:
desc_4[:50]

0     Eminems new track Walk on Water ft Beyonc√© is ...
1     STill got a lot of packages Probably will last...
2     WATCH MY PREVIOUS VIDEO  SUBSCRIBE   FOR WATCH...
3     I know its been a while since we did this show...
4        to Eds channel  Ed onFacebook    Website  J...
5      Follow for News   Also follow DramaAlert on I...
6     Vanoss Merch Shop  by Evan Fong  by Jack Wagne...
7     SHANTELLS CHANNEL     this video in k on this ...
8     Join the movement Be a Maverick   ONE CAN STOP...
9     Sheldon is roasting pastor of the churchyoung ...
10    Watch the official music video of Bank Account...
11    Subscribe Here  Hot Glue Gun Life Hacks For Cr...
12    Thanks for watching the drama Help more people...
13    Song  DaangSinger  Mankirt AulakhFacebook     ...
14    CLICK TO SUBSCRIBE TO THE YOUTUBERS IN THIS EP...
15    In the Outsmarted finale Mike trains with an M...
16     Days left to cop NELK merch  us on Instagramn...
17    I think Sarah Millican was very excited f

In [16]:
# convert to lower case
desc_5 = desc_4.str.lower()

In [17]:
# remove leading and ending spaces
desc_6 = desc_5.str.strip()

In [18]:
desc_6[:50]

0     eminems new track walk on water ft beyonc√© is ...
1     still got a lot of packages probably will last...
2     watch my previous video  subscribe   for watch...
3     i know its been a while since we did this show...
4     to eds channel  ed onfacebook    website  jaso...
5     follow for news   also follow dramaalert on in...
6     vanoss merch shop  by evan fong  by jack wagne...
7     shantells channel     this video in k on this ...
8     join the movement be a maverick   one can stop...
9     sheldon is roasting pastor of the churchyoung ...
10    watch the official music video of bank account...
11    subscribe here  hot glue gun life hacks for cr...
12    thanks for watching the drama help more people...
13    song  daangsinger  mankirt aulakhfacebook     ...
14    click to subscribe to the youtubers in this ep...
15    in the outsmarted finale mike trains with an m...
16    days left to cop nelk merch  us on instagramne...
17    i think sarah millican was very excited f

In [19]:
# tokenization
tokenizer = RegexpTokenizer(r'\w+')
desc_token = desc_6.apply(lambda x: tokenizer.tokenize(x))

In [20]:
desc_token

0         [eminems, new, track, walk, on, water, ft, bey...
1         [still, got, a, lot, of, packages, probably, w...
2         [watch, my, previous, video, subscribe, for, w...
3         [i, know, its, been, a, while, since, we, did,...
4         [to, eds, channel, ed, onfacebook, website, ja...
                                ...                        
120741    [the, cat, who, caught, the, laser, aarons, an...
120742                                                   []
120743    [i, had, so, much, fun, transforming, safiyas,...
120744    [how, black, panther, should, have, endedwatch...
120745    [call, of, duty, black, ops, multiplayer, rais...
Name: description, Length: 120746, dtype: object

In [21]:
# stemming
stemmer = PorterStemmer()
desc_stem = desc_token.apply(lambda x: [stemmer.stem(word) for word in x])

In [22]:
desc_stem

0         [eminem, new, track, walk, on, water, ft, beyo...
1         [still, got, a, lot, of, packag, probabl, will...
2         [watch, my, previou, video, subscrib, for, wat...
3         [i, know, it, been, a, while, sinc, we, did, t...
4         [to, ed, channel, ed, onfacebook, websit, jaso...
                                ...                        
120741     [the, cat, who, caught, the, laser, aaron, anim]
120742                                                   []
120743    [i, had, so, much, fun, transform, safiya, hai...
120744    [how, black, panther, should, have, endedwatch...
120745    [call, of, duti, black, op, multiplay, rais, t...
Name: description, Length: 120746, dtype: object

In [23]:
# lemmatization
lemmatizer = WordNetLemmatizer()
desc_lemma = desc_stem.apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [24]:
desc_lemma

0         [eminem, new, track, walk, on, water, ft, beyo...
1         [still, got, a, lot, of, packag, probabl, will...
2         [watch, my, previou, video, subscrib, for, wat...
3         [i, know, it, been, a, while, sinc, we, did, t...
4         [to, ed, channel, ed, onfacebook, websit, jaso...
                                ...                        
120741     [the, cat, who, caught, the, laser, aaron, anim]
120742                                                   []
120743    [i, had, so, much, fun, transform, safiya, hai...
120744    [how, black, panther, should, have, endedwatch...
120745    [call, of, duti, black, op, multiplay, rais, t...
Name: description, Length: 120746, dtype: object

In [24]:
desc_6 = pd.DataFrame(desc_6)
desc_token = pd.DataFrame(desc_token)
desc_stem = pd.DataFrame(desc_stem)
desc_lemma = pd.DataFrame(desc_lemma)

In [25]:
desc_6.to_csv('desc_clean.csv', index = False)

In [26]:
desc_token.to_csv('desc_token.csv', index = False)

In [27]:
desc_stem.to_csv('desc_stem.csv', index = False)

In [28]:
desc_lemma.to_csv('desc_lemma.csv', index = False)