In [1]:
# coding: utf-8
import pandas as pd
import numpy as np
import datetime
import re
import unicodedata
import time
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from nltk import bigrams
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import TweetTokenizer
from string import punctuation
from configparser import ConfigParser, ExtendedInterpolation

In [2]:
def setup():
    start = time.time()
    tqdm.pandas()
    print ('Running setup...')
    resources = ['taggers/averaged_perceptron_tagger', 'corpora/wordnet', 'corpora/stopwords', 'tokenizers/punkt']
    for path in resources:
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(path.split('/')[1])
    end = time.time()
    print (f'Setup finished in {end-start:.2f} seconds.\n')

In [3]:
setup()

Running setup...
Setup finished in 0.00 seconds.



In [4]:
def days_to_date(srl_no, first=datetime.datetime(2010,1,1,0,0)):
    days = int(srl_no-1)
    new_date = first + datetime.timedelta(days)
    return new_date

In [5]:
def read_data(filename):
    print('Reading data...')
    start = time.time()
    tweets = pd.read_csv(filename, encoding='latin-1', sep=';',header=0, names=['StringDate', 'Days', 'From', 'Tweet'])
    tweets = tweets.filter(items=['Days', 'From', 'Tweet'])
    tweets['Days'] = tweets['Days'].progress_apply(days_to_date)
    tweets.columns=['Date', 'From', 'Tweet']
    tweets = tweets.reset_index()
    tweets = tweets.set_index('Date').sort_index()
    end = time.time()
    print (f'Data read in {end-start:.2f} seconds.\n')
    return tweets

In [6]:
input = '../data/twitter.csv'
data = read_data(input)

Reading data...


100%|██████████| 645981/645981 [00:01<00:00, 364901.25it/s]


Data read in 3.55 seconds.



In [19]:
data.head(3)

Unnamed: 0_level_0,index,From,Tweet
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,65282,@SAI,FOX -Time Warner Spat Shows Why The Death Of T...
2010-01-01,502287,@TechCrunch,Ten Technologies That Will Rock 2010 http://tc...
2010-01-01,502288,@TechCrunch,"Hotel WiFi Should Be a Right, Not a Luxury htt..."


In [20]:
data['From'] = data['From'].str.lower()

In [21]:
data.head()

Unnamed: 0_level_0,index,From,Tweet
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,65282,@sai,FOX -Time Warner Spat Shows Why The Death Of T...
2010-01-01,502287,@techcrunch,Ten Technologies That Will Rock 2010 http://tc...
2010-01-01,502288,@techcrunch,"Hotel WiFi Should Be a Right, Not a Luxury htt..."
2010-01-01,502289,@techcrunch,Twitter and Me! Why It's The Only Social Medi...
2010-01-01,160763,@techreview,Physics arXiv Blog's Highlights of 2009: Septe...


In [23]:
data = data[~data['From'].isin([source.lower() for source in ['@SAI', '@TechCrunch']])]
data.head()

Unnamed: 0_level_0,index,From,Tweet
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,160763,@techreview,Physics arXiv Blog's Highlights of 2009: Septe...
2010-01-01,213682,@mashable,One Website Does Your Resolutions for You Al...
2010-01-01,213683,@mashable,"In 2010, Your iPhone Could Be a Credit Card Re..."
2010-01-01,213684,@mashable,10 Easy Ways to Green Your Web Site - http://b...
2010-01-01,213685,@mashable,PayPal vs Fake PayPal: Can You Tell the Differ...


In [10]:
config = ConfigParser(
    inline_comment_prefixes="#;",
    interpolation=ExtendedInterpolation())
config.read('../config.ini')

['../config.ini']

In [19]:
import json

In [20]:
json.dumps(config)

TypeError: Object of type ConfigParser is not JSON serializable

In [None]:
with open('../config.ini', 'w') as configfile:
    config.write(configfile)

In [30]:
data = data[~data['From']
        .isin(
            [source.strip().lower()
                for source in 
                    config['General']['exclude_sources'].split(",")])]

In [35]:
def remove_accents(text):
    text = unicodedata.normalize('NFD', str(text)).encode('ascii', 'ignore').decode("utf-8").lower()
    return str(text)

def remove_apostrophes(text):
    text = re.sub(r"\'s", "", text)
    return text

def remove_hashtags(text):
    #hashtags and handles
    text = re.sub(r'\B(\#[a-zA-Z]+|\@[a-zA-Z]+\b)', '', text)
    return text

def remove_urls(text):
    text= re.sub(r'http\S+', '', text)
    return text

def remove_numberwords(text):
    text= re.sub(r'\b[0-9]+\b\s*', '', text)
    return text

In [36]:
def clean_text(df, text_column):
    tqdm.write(f'Cleaning up {text_column} texts...')
    start = time.time()
    actions = [action.strip().lower() for action in config['Text Cleaning']['actions'].split(",")]
    for action in actions:
        tqdm.write('.... ' + action)
        df[text_column] = df[text_column].progress_apply(globals()[action])
    end = time.time()
    tqdm.write (f'Text cleanup finished in {end-start:.2f} seconds.\n')
    return df

In [33]:
actions = [action.strip().lower() for action in config['Text Cleaning']['actions'].split(",")]

In [34]:
actions

['remove_accents',
 'remove_apostrophes',
 'remove_hashtags',
 'remove_numberwords',
 'remove_stopwords',
 'save_tokenized']

In [37]:
cleaned = clean_text(data, 'Tweet')

 13%|█▎        | 48428/382045 [00:00<00:01, 197390.05it/s]

Cleaning up Tweet texts...
.... remove_accents


100%|██████████| 382045/382045 [00:01<00:00, 295400.69it/s]
 15%|█▌        | 58446/382045 [00:00<00:01, 274704.01it/s]

.... remove_apostrophes


100%|██████████| 382045/382045 [00:01<00:00, 318650.21it/s]
  8%|▊         | 30740/382045 [00:00<00:02, 149178.82it/s]

.... remove_hashtags


 94%|█████████▍| 358959/382045 [00:02<00:00, 132090.89it/s]

KeyboardInterrupt: 

In [None]:
#cleaned.to_pickle('cleaned.data')



In [15]:
def _tokenize(text):
    new_text = []
    for word, tag in pos_tag(tknzr.tokenize(text)):
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 's'] else None
        if wntag:  # remove verbs
            lemma = lmtzr.lemmatize(word, wntag)
            new_text.append(lemma)
    return new_text


def tokenize(df, text_column):
    print(f'Tokenizing Dataframe["{text_column}"].')
    start = time.time()
    #df['Unigrams'] = df[text_column].progress_apply(_tokenize)
    df['Unigrams'] = df[text_column].progress_apply(tknzr.tokenize)
    end = time.time()
    print(f'Dataframe["{text_column}"] tokenized in {end-start:.2f} seconds.\n')
    return df

In [18]:
tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
lmtzr = WordNetLemmatizer()
tokens = tokenize(cleaned, 'Tweet')

  0%|          | 317/645981 [00:00<03:32, 3038.09it/s]

Tokenizing Dataframe["Tweet"].


100%|██████████| 645981/645981 [00:53<00:00, 12147.03it/s]


Dataframe["Tweet"] tokenized in 53.44 seconds.



In [17]:
def remove_stopwords(input):
    stop_words = set(stopwords.words('english') + list(punctuation) + [' ', 'rt', '...', '-->', ']:', '}:'])
    output = [i for i in input if i not in stop_words]
    return output

def remove_extremewords(input):
    output = [i for i in input if (len(i)<20 and len(i)>1)]   
    return output

In [20]:
def clean_tokens(df):
    tqdm.write('Cleaning up tokens...')
    start = time.time()
    tqdm.write('.... removing stop words')
    df['Unigrams'] = df['Unigrams'].progress_apply(remove_stopwords)
    tqdm.write('.... removing extreme words')
    df['Unigrams'] = df['Unigrams'].progress_apply(remove_extremewords)
    end = time.time()
    tqdm.write (f'Tokens cleanup finished in {end-start:.2f} seconds.\n')
    return df

In [21]:
cleantokens = clean_tokens(tokens)

  0%|          | 649/645981 [00:00<04:44, 2265.09it/s]

Cleaning up tokens...
.... removing stop words


100%|██████████| 645981/645981 [02:06<00:00, 5119.29it/s]
  3%|▎         | 19606/645981 [00:00<00:03, 195979.82it/s]

.... removing extreme words


100%|██████████| 645981/645981 [00:03<00:00, 210588.14it/s]


Tokens cleanup finished in 129.56 seconds.



In [None]:
stops= set(stopwords.words('english') +
    ["".join(string.split()).split(',') 
        for string in [v for k, v in config.items('Stop Words')]])
    output = [i for i in input if i not in stops]

In [70]:
l=[]
[l.append(list) for list in ["".join(string.split()).split(',') 
        for string in [v for k, v in config.items('Stop Words')]]]

[None, None, None, None, None, None]

In [84]:
flat_list = [item for sublist in (stopwords.words('english') + ["".join(string.split()).split(',') 
        for string in [v for k, v in config.items('Stop Words')]]) for item in sublist]

In [90]:
ll = [stopwords.words('english')] + ["".join(string.split()).split(',') 
        for string in [v for k, v in config.items('Stop Words')]]
l = [item for sublist in ll for item in sublist]

In [95]:
set(l)

{"'cause",
 '):',
 '-->',
 '..',
 '...',
 '....',
 '/:',
 '4s',
 '5s',
 '6s',
 ':p',
 '<u+>',
 "Ha'ta",
 "I'd",
 "I'll",
 "I'm",
 "I'm'a",
 "I'm'o",
 "I've",
 "S'e",
 ']:',
 'a',
 'aapl',
 'aaron',
 'about',
 'above',
 'according',
 'acquisition',
 'actually',
 'adele',
 'adobe',
 'after',
 'again',
 'against',
 'ago',
 'ahmed',
 'ain',
 "ain't",
 'airbnb',
 'alan',
 'alec',
 'alex',
 'alexa',
 'alibaba',
 'alicia',
 'all',
 'already',
 'also',
 'am',
 'amazon',
 'amazons',
 'america',
 'american',
 "amn't",
 'an',
 'and',
 'andreesen',
 'andrew',
 'android',
 'andy',
 'angeles',
 'anniversary',
 'announce',
 'announced',
 'announcement',
 'announces',
 'annual',
 'another',
 'answers',
 'any',
 'anyone',
 'anytime',
 'aol',
 'appl',
 'apple',
 'apples',
 'apr',
 'april',
 'are',
 'aren',
 "aren't",
 'arent',
 'around',
 'arrive',
 'arrives',
 'as',
 'ashley',
 'ashton',
 'ask',
 'asks',
 'assange',
 'asus',
 'at',
 'atari',
 'audi',
 'aug',
 'august',
 'australia',
 'australian',
 'av

In [93]:
len(l)

988

In [94]:
l

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [28]:
x = cleantokens['Unigrams'][0]

In [31]:
[f'{tuple[0]} {tuple[1]}' for tuple in list(bigrams(x))]

['fox time',
 'time warner',
 'warner spat',
 'spat shows',
 'shows death',
 'death tv',
 'tv cannot',
 'cannot come',
 'come soon',
 'soon enough']

In [51]:
s = [tuple[0] for tuple in config.items('Stop Words')]

In [53]:
config["Stop Words"]["contractions"]

"\n'cause"

In [None]:
stop_words = set(stopwords.words('english') + contractions + named_entities + emojis + money + time_event + too_common)

In [32]:
df = cleantokens

In [None]:
def generateBigrams(df):
    tqdm.write('.... generating bigrams')
    start = time.time()
    df['Bigrams'] = df['Unigrams'].progress_apply(lambda x: [f'{tuple[0]} {tuple[1]}' for tuple in list(bigrams(x))])
    df['NumTokens']=df['Unigrams'].apply(len)
    df['NumBigrams']=df['Bigrams'].apply(len)
#     df=df[df['NumTokens']<40]
#     df=df[df['NumTokens']>1]
    end = time.time()
    tqdm.write (f'Tokens cleanup finished in {end-start:.2f} seconds.\n')
    return df

In [None]:

output = 'tokenized.data'
generateBigrams(df).to_pickle(output)