In [3]:
# coding: utf-8
import pandas as pd
import numpy as np
import datetime
import re
import unicodedata
import time
from tqdm import tqdm
from contractionExpander import expand_text
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from nltk import bigrams
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import TweetTokenizer
from string import punctuation

In [1]:
def setup():
    """Prepare the session: hook tqdm into pandas and fetch missing NLTK data.

    Calls ``tqdm.pandas()`` (which the ``progress_apply`` calls used by the
    rest of the notebook rely on) and then checks each required NLTK
    resource, downloading only the ones ``nltk.data.find`` cannot locate.
    Prints the elapsed wall-clock time when done.
    """
    began = time.time()
    tqdm.pandas()
    print ('Running setup...')
    required = (
        'taggers/averaged_perceptron_tagger',
        'corpora/wordnet',
        'corpora/stopwords',
        'tokenizers/punkt',
    )
    for resource in required:
        try:
            nltk.data.find(resource)
        except LookupError:
            # The download id is the part after the category prefix.
            _, package = resource.split('/')
            nltk.download(package)
    elapsed = time.time() - began
    print (f'Setup finished in {elapsed:.2f} seconds.\n')

In [4]:
# Must run before any cell that uses `progress_apply`: setup() registers
# tqdm's pandas hooks and makes sure the required NLTK resources are present.
setup()

Running setup...
Setup finished in 0.00 seconds.



In [6]:
def days_to_date(srl_no, first=datetime.datetime(2010,1,1,0,0)):
    """Convert a 1-based day serial number into a datetime.

    Args:
        srl_no: Day serial number; serial 1 maps to ``first`` itself. Any
            fractional part of ``srl_no - 1`` is truncated by ``int``.
        first: The datetime that serial number 1 corresponds to.

    Returns:
        ``first`` shifted forward by ``int(srl_no - 1)`` whole days.
    """
    offset = datetime.timedelta(days=int(srl_no - 1))
    return first + offset

In [7]:
def read_data(filename):
    """Load the tweet CSV and return it indexed and sorted by date.

    The file is read as ``;``-separated latin-1 text; its header row is
    replaced by the names StringDate/Days/From/Tweet. Only Days, From and
    Tweet are kept, and Days is converted with ``days_to_date`` and renamed
    to Date. ``reset_index`` keeps the original row position as an ``index``
    column before Date becomes the (sorted) index.

    Requires ``tqdm.pandas()`` to have been called (see ``setup``), because
    the conversion uses ``progress_apply``.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        DataFrame indexed by Date with columns ``index``, ``From``, ``Tweet``.
    """
    print('Reading data...')
    t0 = time.time()
    raw = pd.read_csv(
        filename,
        encoding='latin-1',
        sep=';',
        header=0,
        names=['StringDate', 'Days', 'From', 'Tweet'],
    )
    frame = raw.filter(items=['Days', 'From', 'Tweet'])
    frame['Days'] = frame['Days'].progress_apply(days_to_date)
    frame.columns = ['Date', 'From', 'Tweet']
    frame = frame.reset_index().set_index('Date').sort_index()
    elapsed = time.time() - t0
    print (f'Data read in {elapsed:.2f} seconds.\n')
    return frame

In [8]:
# Named `input_path` rather than `input` so the builtin input() is not
# shadowed for the rest of the kernel session.
input_path = 'twitter.csv'
data = read_data(input_path)

Reading data...


100%|██████████| 645981/645981 [00:01<00:00, 407592.64it/s]


Data read in 3.07 seconds.



In [10]:
# Preview the first three rows; the frame is indexed by Date and the `index`
# column holds each row's position in the original CSV (from reset_index).
data.head(3)

Unnamed: 0_level_0,index,From,Tweet
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,65282,@SAI,FOX -Time Warner Spat Shows Why The Death Of T...
2010-01-01,502287,@TechCrunch,Ten Technologies That Will Rock 2010 http://tc...
2010-01-01,502288,@TechCrunch,"Hotel WiFi Should Be a Right, Not a Luxury htt..."


In [11]:
def remove_accents(text):
    """Return ``text`` as lower-case ASCII with accents stripped.

    The value is first coerced with ``str`` (so non-string inputs become
    their string form), decomposed with NFD so combining marks separate from
    their base letters, and then every non-ASCII code point is discarded.
    """
    decomposed = unicodedata.normalize('NFD', str(text))
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode("utf-8").lower()

def remove_apostrophes(text):
    """Delete every ``'s`` sequence from ``text``.

    Only an apostrophe immediately followed by ``s`` is removed; other
    apostrophes (e.g. in "don't") are left as they are.
    """
    possessive = re.compile(r"\'s")
    return possessive.sub("", text)

def remove_hashtags(text):
    """Strip ``#hashtag`` and ``@handle`` tokens made of ASCII letters.

    A match must start at a non-word boundary, so ``a#b`` and e-mail-like
    ``name@host`` are untouched. Only letters are consumed: ``#web2`` leaves
    the trailing ``2`` behind, and a handle followed directly by a digit or
    underscore is not removed at all.
    """
    # hashtags and handles
    tag_or_handle = re.compile(r'\B(\#[a-zA-Z]+|\@[a-zA-Z]+\b)')
    return tag_or_handle.sub('', text)

def remove_urls(text):
    """Remove every run of non-whitespace characters that starts with ``http``.

    At least one character must follow ``http`` for a match, and surrounding
    whitespace is kept, so a removed URL can leave a double space behind.
    """
    url = re.compile(r'http\S+')
    return url.sub('', text)

def remove_numberwords(text):
    """Drop stand-alone digit runs together with the whitespace after them.

    Digits embedded in a word (``web2``, ``1a``) are kept because the match
    requires a word boundary on both sides of the digit run.
    """
    standalone_number = re.compile(r'\b[0-9]+\b\s*')
    return standalone_number.sub('', text)

In [12]:
def clean_text(df, text_column):
    """Run the text-normalisation steps over ``df[text_column]`` in place.

    Steps, in order: accent removal (which also lower-cases), URL removal,
    hashtag/handle removal, ``'s`` removal, stand-alone number removal and
    finally contraction expansion via ``expand_text``.

    NOTE(review): ``'s`` is stripped before ``expand_text`` runs, so that
    helper never sees forms such as "it's" -- confirm this order is intended.

    Args:
        df: DataFrame to clean; its ``text_column`` is overwritten.
        text_column: Name of the column holding the raw text.

    Returns:
        The same ``df`` object that was passed in.
    """
    tqdm.write(f'Cleaning up {text_column} texts...')
    began = time.time()
    pipeline = (
        ('.... removing accents', remove_accents),
        ('.... removing URLs', remove_urls),
        ('.... removing hashtags', remove_hashtags),
        ('.... removing apostrophes', remove_apostrophes),
        ('.... removing numbers', remove_numberwords),
        ('.... expanding contractions', expand_text),
    )
    for banner, step in pipeline:
        tqdm.write(banner)
        df[text_column] = df[text_column].progress_apply(step)
    finished = time.time()
    tqdm.write (f'Text cleanup finished in {finished-began:.2f} seconds.\n')
    return df

In [13]:
# clean_text overwrites data['Tweet'] in place and returns the same DataFrame,
# so `cleaned` and `data` are two names for one object (the raw text is gone).
cleaned = clean_text(data, 'Tweet')

  2%|▏         | 15848/645981 [00:00<00:03, 158472.22it/s]

Cleaning up Tweet texts...
.... removing accents


100%|██████████| 645981/645981 [00:02<00:00, 317651.93it/s]
  8%|▊         | 51402/645981 [00:00<00:02, 242424.09it/s]

.... removing URLs


100%|██████████| 645981/645981 [00:02<00:00, 289635.77it/s]
  5%|▌         | 34160/645981 [00:00<00:03, 163328.83it/s]

.... removing hashtags


100%|██████████| 645981/645981 [00:03<00:00, 174042.07it/s]
  4%|▍         | 27153/645981 [00:00<00:02, 271529.61it/s]

.... removing apostrophes


100%|██████████| 645981/645981 [00:01<00:00, 340264.19it/s]
  5%|▌         | 33893/645981 [00:00<00:03, 160984.98it/s]

.... removing numbers


100%|██████████| 645981/645981 [00:03<00:00, 179268.27it/s]
  0%|          | 970/645981 [00:00<02:20, 4599.07it/s]

.... expanding contractions


100%|██████████| 645981/645981 [02:16<00:00, 4727.78it/s]


Text cleanup finished in 150.45 seconds.



In [None]:
#cleaned.to_pickle('cleaned.data')



In [15]:
def _tokenize(text):
    """Tokenize ``text`` and return the lemmas of its non-verb content words.

    Tokens come from the module-level ``tknzr``, are tagged with ``pos_tag``
    and lemmatized with the module-level ``lmtzr``. A token is kept only when
    the first letter of its tag maps to a WordNet POS code; verbs and all
    other unmapped tags are dropped.

    Fix: adjective tags start with ``J`` under the Penn Treebank tagset that
    ``pos_tag`` uses by default, while WordNet's adjective code is ``a``.
    The previous direct comparison therefore discarded every adjective.

    Args:
        text: The string to tokenize.

    Returns:
        List of lemmatized tokens, in their original order.
    """
    # First letter of the tag (lower-cased) -> WordNet POS code.
    # 'a', 'r', 'n' and 's' are passed through exactly as before; 'j' is the
    # added Penn-adjective translation.
    # NOTE(review): 's' also matches a Penn 'SYM' tag, not only WordNet's
    # satellite-adjective code -- confirm whether symbols should be kept.
    wordnet_pos = {'j': 'a', 'a': 'a', 'r': 'r', 'n': 'n', 's': 's'}
    lemmas = []
    for word, tag in pos_tag(tknzr.tokenize(text)):
        wntag = wordnet_pos.get(tag[:1].lower())
        if wntag:  # remove verbs (and anything else without a mapping)
            lemmas.append(lmtzr.lemmatize(word, wntag))
    return lemmas


def tokenize(df, text_column):
    """Add a ``Unigrams`` column holding the token list of each text.

    Tokenization uses the module-level ``tknzr`` directly; the POS-filtering,
    lemmatizing ``_tokenize`` variant is deliberately not applied here.

    Args:
        df: DataFrame to extend; it is modified in place.
        text_column: Name of the column containing the text to tokenize.

    Returns:
        The same ``df`` object, now with a ``Unigrams`` column.
    """
    print(f'Tokenizing Dataframe["{text_column}"].')
    t0 = time.time()
    texts = df[text_column]
    # Alternative kept for reference: texts.progress_apply(_tokenize)
    df['Unigrams'] = texts.progress_apply(tknzr.tokenize)
    elapsed = time.time() - t0
    print(f'Dataframe["{text_column}"] tokenized in {elapsed:.2f} seconds.\n')
    return df

In [18]:
# Module-level tokenizer used by tokenize() and _tokenize(): lower-cases
# (preserve_case=False) and strips @handles; see the NLTK docs for reduce_len.
tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
# Only _tokenize() uses the lemmatizer, and tokenize() does not currently call it.
lmtzr = WordNetLemmatizer()
# tokenize() adds the 'Unigrams' column in place, so `tokens` is `cleaned`.
tokens = tokenize(cleaned, 'Tweet')

  0%|          | 317/645981 [00:00<03:32, 3038.09it/s]

Tokenizing Dataframe["Tweet"].


100%|██████████| 645981/645981 [00:53<00:00, 12147.03it/s]


Dataframe["Tweet"] tokenized in 53.44 seconds.



In [17]:
_STOP_WORDS = None  # built lazily on first use, then shared by every call


def _get_stop_words():
    """Return the shared stop-token set, building it on the first call only."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(
            stopwords.words('english')
            + list(punctuation)
            + [' ', 'rt', '...', '-->', ']:', '}:']
        )
    return _STOP_WORDS


def remove_stopwords(input):
    """Return the tokens of ``input`` that are not stop tokens.

    Stop tokens are NLTK's English stop words, every single punctuation
    character, and a few tweet-specific extras (``rt``, ``...``, arrows and
    emoticon fragments).

    The set is built once and cached rather than on every call: this function
    runs once per row, and rebuilding it re-read the stop-word corpus each time.
    It is built lazily so that defining the function does not require the corpus
    to be downloaded yet.

    Args:
        input: Iterable of token strings.

    Returns:
        List of the tokens that are not in the stop set, in original order.
    """
    stop_words = _get_stop_words()
    return [token for token in input if token not in stop_words]

def remove_extremewords(input):
    """Keep only tokens whose length is between 2 and 19 characters inclusive.

    Args:
        input: Iterable of tokens (anything supporting ``len``).

    Returns:
        List of the tokens that pass the length check, in original order.
    """
    kept = []
    for token in input:
        if 1 < len(token) < 20:
            kept.append(token)
    return kept

In [20]:
def clean_tokens(df):
    """Filter ``df['Unigrams']`` in place: stop tokens first, then by length.

    Args:
        df: DataFrame with a ``Unigrams`` column of token lists; the column
            is overwritten.

    Returns:
        The same ``df`` object that was passed in.
    """
    tqdm.write('Cleaning up tokens...')
    t0 = time.time()
    filters = (
        ('.... removing stop words', remove_stopwords),
        ('.... removing extreme words', remove_extremewords),
    )
    for banner, token_filter in filters:
        tqdm.write(banner)
        df['Unigrams'] = df['Unigrams'].progress_apply(token_filter)
    elapsed = time.time() - t0
    tqdm.write (f'Tokens cleanup finished in {elapsed:.2f} seconds.\n')
    return df

In [21]:
# clean_tokens rewrites the 'Unigrams' column in place and returns the same
# DataFrame, so `cleantokens` is another name for `tokens`.
cleantokens = clean_tokens(tokens)

  0%|          | 649/645981 [00:00<04:44, 2265.09it/s]

Cleaning up tokens...
.... removing stop words


100%|██████████| 645981/645981 [02:06<00:00, 5119.29it/s]
  3%|▎         | 19606/645981 [00:00<00:03, 195979.82it/s]

.... removing extreme words


100%|██████████| 645981/645981 [00:03<00:00, 210588.14it/s]


Tokens cleanup finished in 129.56 seconds.



In [28]:
# The frame is indexed by Date, so the first row is taken by position with
# .iloc; a bare [0] relies on pandas' deprecated integer-position fallback.
x = cleantokens['Unigrams'].iloc[0]

In [31]:
# Join each adjacent token pair of the sample row into one "w1 w2" string.
[f'{first} {second}' for first, second in bigrams(x)]

['fox time',
 'time warner',
 'warner spat',
 'spat shows',
 'shows death',
 'death tv',
 'tv cannot',
 'cannot come',
 'come soon',
 'soon enough']

In [32]:
# Plain alias, not a copy: changes made through `df` also show up in `cleantokens`.
df = cleantokens

In [None]:
def generateBigrams(df):
    """Add bigram strings and token/bigram counts to ``df`` in place.

    Adds three columns derived from ``df['Unigrams']``:

    * ``Bigrams``    - each adjacent token pair joined as ``"w1 w2"``
    * ``NumTokens``  - number of unigrams in the row
    * ``NumBigrams`` - number of bigrams in the row

    Requires ``tqdm.pandas()`` to have been called, because the bigram step
    uses ``progress_apply``.

    Args:
        df: DataFrame with a ``Unigrams`` column of token lists.

    Returns:
        The same ``df`` object with the three columns added.
    """
    tqdm.write('.... generating bigrams')
    start = time.time()
    # Unpack each pair instead of indexing through a variable named `tuple`,
    # which shadowed the builtin inside the comprehension.
    df['Bigrams'] = df['Unigrams'].progress_apply(
        lambda tokens: [f'{first} {second}' for first, second in bigrams(tokens)]
    )
    df['NumTokens'] = df['Unigrams'].apply(len)
    df['NumBigrams'] = df['Bigrams'].apply(len)
    # Optional row filter by token count, currently disabled:
    #     df=df[df['NumTokens']<40]
    #     df=df[df['NumTokens']>1]
    end = time.time()
    # The message used to read "Tokens cleanup finished", copied from clean_tokens.
    tqdm.write(f'Bigram generation finished in {end-start:.2f} seconds.\n')
    return df

In [None]:

# Destination for the final pickled frame.
output = 'tokenized.data'
# generateBigrams adds its columns to `df` in place and returns it; the result
# is written to `output` with DataFrame.to_pickle.
generateBigrams(df).to_pickle(output)