In [1]:
import numpy as np
import pandas as pd
import warnings
import nltk

import spacy
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
import unicodedata
from collections import Counter
import wordcloud
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')
# Load spaCy's small English model once and keep it in the module-level `nlp`
# name so later cells can reuse it.
# NOTE(review): `parse`, `tag` and `entity` look like legacy keyword overrides;
# confirm that the installed spaCy version still accepts them in `spacy.load`.
nlp = spacy.load('en_core_web_sm', parse = True, tag=True, entity=True)

#### Functions:
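Subscriber: convert counts with a 'K'/'M' suffix to plain numbers <br>
View: strip the 'views' label and convert 'K'/'M' suffixes to plain numbers <br>
Uploadtime: convert 'N hours/days/weeks/months/years ago' to a number of hours <br>
Length: convert the video duration to total seconds <br>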

In [2]:

def normalize_document(doc):
    """
    Clean a single text string for downstream text analysis.

    Steps, in order:
      1. fold accented characters to their closest ASCII letters,
      2. drop every character that is not an ASCII letter or whitespace,
      3. lower-case and strip surrounding whitespace,
      4. tokenize, remove English stop words, and lemmatize each remaining
         token as a noun.

    Parameters
    ----------
    doc : str
        Raw text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    # Fold accents BEFORE filtering: the character filter below only keeps
    # ASCII letters, so running it first would delete accented letters
    # instead of transliterating them.
    doc = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # The flags must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so passing flags there silently caps the number of
    # substitutions and never applies the flags.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()

    tokens = nltk.WordPunctTokenizer().tokenize(doc)

    # A set gives constant-time membership tests for the stop-word filter.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    return ' '.join(
        lemmatizer.lemmatize(token, pos='n')
        for token in tokens
        if token not in stop_words
    )

In [3]:
# Suffix -> multiplier for abbreviated counts such as '1.2K' or '3M'.
_COUNT_MULTIPLIERS = {'K': 10**3, 'M': 10**6}

# Unit code -> number of hours (a month is taken as 30 days, a year as 365).
_HOURS_PER_UNIT = {'H': 1, 'D': 24, 'W': 7*24, 'M': 30*24, 'Y': 365*24}

# Unit words and their one-letter codes; plurals come first so that 'hours'
# is not left behind as 'Hs' after replacing 'hour'.
_TIME_UNIT_CODES = (
    ('hours', 'H'), ('hour', 'H'),
    ('days', 'D'), ('day', 'D'),
    ('weeks', 'W'), ('week', 'W'),
    ('months', 'M'), ('month', 'M'),
    ('years', 'Y'), ('year', 'Y'),
)


def _scale_by_suffix(column, multipliers):
    """Turn strings like '1.2K' into integers using a suffix -> multiplier map.

    Values without a suffix are multiplied by 1. Raises ValueError when a
    value carries a suffix that is not a key of `multipliers` (for example a
    repeated letter such as 'KK').
    """
    text = column.astype(str).str.strip()
    suffix_class = '[' + ''.join(multipliers) + ']+'

    magnitude = text.str.replace(suffix_class + '$', '', regex=True).astype(float)
    suffix = text.str.extract(r'[\d\.]+(' + suffix_class + ')', expand=False)
    factor = suffix.map(multipliers)

    unknown = suffix.notna() & factor.isna()
    if unknown.any():
        raise ValueError(
            'unrecognised suffix in values: %r' % text[unknown].unique().tolist()
        )

    # Round before casting: a floating-point product can land just below the
    # intended integer, and a bare astype(int) would truncate it.
    return (magnitude * factor.fillna(1)).round().astype(int)


def clean(data):
    """
    Return a cleaned copy of a scraped video data frame.

    Rows with missing values are dropped and the index is reset. The input
    frame itself is not modified. The frame must contain the columns
    `subscriber`, `view`, `uploadtime`, `length`, `like`, `dislike` and
    `title`.

    Conversions applied:
      * subscriber : 'K'/'M' suffixed counts -> int
      * view       : 'views' label and spaces removed, 'K'/'M' counts -> int
      * uploadtime : '<n> hours/days/weeks/months/years ago' (optionally
                     prefixed with 'Streamed') -> int number of hours
      * length     : 'm:ss', 'mm:ss' or 'h:mm:ss' style duration -> int seconds
      * like, dislike : cast to int
      * clean_text : new column, `normalize_document` applied to `title`

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        New data frame with the converted columns and the extra
        `clean_text` column.
    """
    data = data.dropna().reset_index(drop=True)

    data['subscriber'] = _scale_by_suffix(data['subscriber'], _COUNT_MULTIPLIERS)

    views = (
        data['view'].astype(str)
        .str.replace('views', '', regex=False)
        .str.replace(' ', '', regex=False)
    )
    data['view'] = _scale_by_suffix(views, _COUNT_MULTIPLIERS)

    uploaded = data['uploadtime'].astype(str)
    for noise in ('Streamed', 'ago'):
        uploaded = uploaded.str.replace(noise, '', regex=False)
    for word, code in _TIME_UNIT_CODES:
        uploaded = uploaded.str.replace(word, code, regex=False)
    uploaded = uploaded.str.replace(' ', '', regex=False)
    data['uploadtime'] = _scale_by_suffix(uploaded, _HOURS_PER_UNIT)

    # Pad short durations to hh:mm:ss so pd.to_timedelta can parse them.
    # This is done on the whole column at once; the previous per-row chained
    # assignment (data.length[i] = ...) does not write back under pandas
    # Copy-on-Write and needed warnings to be silenced globally.
    length = data['length'].astype(str)
    n_chars = length.str.len()
    length = (
        length
        .mask(n_chars == 4, '00:0' + length)   # 'm:ss'  -> '00:0m:ss'
        .mask(n_chars == 5, '00:' + length)    # 'mm:ss' -> '00:mm:ss'
    )
    data['length'] = pd.to_timedelta(length).dt.total_seconds().astype(int)

    data['like'] = data['like'].astype(int)
    data['dislike'] = data['dislike'].astype(int)

    # Series.map also works on an empty frame, where np.vectorize raises.
    data['clean_text'] = data['title'].map(normalize_document)

    return data

#### Import and Clean our data.

In [7]:
# Read the raw scrape (first CSV column is the index, ',' is the thousands
# separator), clean it, and persist the cleaned frame.
data = pd.read_csv(
    r'data/video_03102020.csv',
    index_col=0,
    thousands=',',
    encoding='utf-8',
)
data = clean(data)
data.to_csv('data/video_clean.csv')

In [None]:
# Same load -> clean -> save pipeline for the second scrape file.
data2 = pd.read_csv(
    r'data/video_03142020_143324.csv',
    index_col=0,
    thousands=',',
    encoding='utf-8',
)
data2 = clean(data2)
data2.to_csv('data/video_clean2.csv')