In [56]:
# coding: utf-8

import pandas as pd
import numpy as np
import datetime
import re
import unicodedata
import time
from tqdm import tqdm
import nltk
from nltk import word_tokenize, pos_tag
from nltk import bigrams
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import TweetTokenizer
from string import punctuation
from nltk.corpus import stopwords
from configparser import ConfigParser, ExtendedInterpolation

In [57]:
def setup():
    """Prepare the notebook environment.

    Registers tqdm's pandas integration (which provides ``progress_apply``)
    and checks that each NLTK resource listed below can be found locally,
    downloading any that is missing. Prints the elapsed time when finished.
    """
    started_at = time.time()
    print ('Running setup...')
    tqdm.pandas()
    required = (
        'taggers/averaged_perceptron_tagger',
        'corpora/wordnet',
        'corpora/stopwords',
        'tokenizers/punkt',
    )
    for resource_path in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            # The download identifier is the path component after the category folder.
            package = resource_path.split('/')[1]
            nltk.download(package)
    elapsed = time.time() - started_at
    print (f'Setup finished in {elapsed:.2f} seconds.\n')

In [58]:
# Run the environment setup defined above (tqdm/pandas hook and NLTK resources).
setup()

Running setup...
Setup finished in 0.03 seconds.



In [59]:
# Load the project settings from ../config.ini. '#' and ';' are accepted as
# inline comment prefixes, and ExtendedInterpolation is enabled for values.
config = ConfigParser(
    inline_comment_prefixes="#;",
    interpolation=ExtendedInterpolation())
config.read('../config.ini')

['../config.ini']

In [60]:
# Pull the input path, output path and text column name from the config.
# NOTE(review): `input` shadows the Python builtin of the same name; later
# cells refer to it by this name, so it is left unchanged here.
input = config['General']['input_file']
output = config['Text Cleaning']['tokenized_file']
text_column = config['General']['input_file_text_column']

In [61]:
# Show the values that were read from the config.
(input, output, text_column)

('./data/twitter.csv', './data/tokenized.data', 'Tweet')

In [62]:
# Override the config-derived paths ('./data/...') with '../data/...'.
# NOTE(review): presumably needed because the notebook runs from a
# subdirectory of the project (config.ini is also read from '..') - confirm.
input = '../data/twitter.csv'
output = '../data/tokenized.data'

In [63]:
# Shared NLTK helpers. The tokenizer is configured with preserve_case=False,
# strip_handles=True and reduce_len=True; it is used by tokenize() below.
tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
# Lemmatizer instance; not referenced by the other cells visible here.
lmtzr = WordNetLemmatizer()

In [64]:
def days_to_date(srl_no):
    """Turn a 1-based day serial number into a ``datetime``.

    The reference date is read from the ``first_date`` option of the
    ``General`` config section, written as comma-separated integers
    (year, month, day). Serial number 1 maps to that date itself.

    Args:
        srl_no: day serial number; ``srl_no - 1`` is truncated to an int.

    Returns:
        ``datetime.datetime`` that lies ``srl_no - 1`` days after the first date.
    """
    raw_parts = config['General']['first_date'].split(',')
    parts = [int(piece.strip()) for piece in raw_parts]
    origin = datetime.datetime(parts[0], parts[1], parts[2])
    offset = datetime.timedelta(days=int(srl_no - 1))
    return origin + offset

In [65]:
def read_data(filename):
    """Load the tweets CSV and return it indexed and sorted by date.

    The file is read as a ';'-separated, latin-1 encoded CSV whose header row
    is replaced by the names StringDate, Days, From and Tweet. StringDate is
    dropped and 'Days' is renamed to 'Date' (converted with ``days_to_date``
    first when the ``General``/``convert_date`` option is true). Source names
    in 'From' are lower-cased, and rows whose source appears in the
    comma-separated ``General``/``exclude_sources`` option are removed.

    Args:
        filename: path of the CSV file to read.

    Returns:
        DataFrame with columns 'From' and 'Tweet', indexed by 'Date' and
        sorted by that index.
    """
    print('Reading data...')
    start = time.time()
    tweets = pd.read_csv(
        filename,
        encoding='latin-1',
        sep=';',
        header=0,
        names=['StringDate', 'Days', 'From', 'Tweet'])
    tweets = tweets.filter(items=['Days', 'From', 'Tweet'])
    if config.getboolean('General', 'convert_date'):
        tweets['Days'] = tweets['Days'].progress_apply(days_to_date)
    tweets.columns = ['Date', 'From', 'Tweet']
    # Lower-case only real strings: an empty source field is read as NaN
    # (a float), and calling .lower() on it would raise AttributeError.
    tweets['From'] = tweets['From'].map(
        lambda source: source.lower() if isinstance(source, str) else source)
    excluded_sources = [
        source.strip().lower()
        for source in config['General']['exclude_sources'].split(",")]
    tweets = tweets[~tweets['From'].isin(excluded_sources)]
    tweets = tweets.set_index('Date', drop=True).sort_index()
    end = time.time()
    print (f'Data read in {end-start:.2f} seconds.\n')
    return tweets

In [66]:
# Load the raw tweets from the (overridden) input path.
input_data = read_data(input)

Reading data...


100%|██████████| 645981/645981 [00:11<00:00, 53945.58it/s]


Data read in 14.83 seconds.



In [67]:
def lower_case(text):
    """Return the string form of ``text`` converted to lower case."""
    as_string = str(text)
    return as_string.lower()
    
def remove_accents(text):
    """Strip accents from ``text``.

    The string form of ``text`` is decomposed with NFD normalization, then
    every character that cannot be encoded as ASCII (including the detached
    combining marks) is dropped.
    """
    decomposed = unicodedata.normalize('NFD', str(text))
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return str(ascii_bytes.decode("utf-8"))

def remove_apostrophes(text):
    """Strip a trailing ``'s`` (possessive or contraction) from words.

    Only an ``'s`` that ends at a word boundary is removed, so an opening
    single quote followed by a word starting with "s" (e.g. ``'samsung'``)
    is left untouched instead of losing its first letter.

    Args:
        text: string to clean.

    Returns:
        ``text`` with every word-final ``'s`` deleted.
    """
    return re.sub(r"'s\b", "", text)

def remove_hashtags(text):
    """Delete alphanumeric #hashtags and @handles from ``text``.

    A tag or handle is only matched where no word boundary precedes it, so
    an '@' directly after a word character (as in an e-mail address) stays.
    """
    # hashtags and handles
    tag_or_handle = r'\B(\#([0-9]|[a-zA-Z])+|\@([0-9]|[a-zA-Z])+\b)'
    return re.sub(tag_or_handle, '', text)
def remove_urls(text):
    """Delete every run of non-whitespace characters that starts with 'http'."""
    url_pattern = r'http\S+'
    return re.sub(url_pattern, '', text)

def remove_numberwords(text):
    """Delete standalone digit-only words together with any whitespace after them.

    Digits embedded in a longer word (e.g. ``ps3``) are not affected.
    """
    number_word = r'\b[0-9]+\b\s*'
    return re.sub(number_word, '', text)


def clean_text(df, text_column):
    """Run the configured text-cleaning steps over one column of ``df``.

    The step names come from the comma-separated ``actions`` option of the
    ``Text Cleaning`` config section. Each name is stripped, lower-cased and
    looked up as a function in this module's globals, then applied to every
    value of the column in the listed order. The column is overwritten on
    the frame that was passed in.

    Args:
        df: DataFrame holding the texts.
        text_column: name of the column to clean.

    Returns:
        The same ``df`` object, with ``text_column`` cleaned.
    """
    tqdm.write(f'Cleaning up {text_column} texts...')
    began = time.time()
    configured = config['Text Cleaning']['actions']
    step_names = list(map(lambda piece: piece.strip().lower(), configured.split(",")))
    for step_name in step_names:
        tqdm.write('.... ' + step_name)
        df[text_column] = df[text_column].progress_apply(globals()[step_name])
    elapsed = time.time() - began
    tqdm.write(f'Text cleanup finished in {elapsed:.2f} seconds.\n')
    return df

In [68]:
# Apply the configured cleaning steps. clean_text assigns into the frame it
# receives and returns it, so `cleaned` and `input_data` are the same object.
cleaned = clean_text(input_data, text_column)

  4%|▎         | 18593/518145 [00:00<00:02, 185928.40it/s]

Cleaning up Tweet texts...
.... lower_case


100%|██████████| 518145/518145 [00:01<00:00, 354816.76it/s]
 11%|█▏        | 58592/518145 [00:00<00:01, 268926.63it/s]

.... remove_accents


100%|██████████| 518145/518145 [00:01<00:00, 319550.24it/s]
 10%|█         | 52214/518145 [00:00<00:01, 246780.91it/s]

.... remove_apostrophes


100%|██████████| 518145/518145 [00:01<00:00, 294789.32it/s]
  5%|▌         | 26311/518145 [00:00<00:03, 125367.03it/s]

.... remove_hashtags


100%|██████████| 518145/518145 [00:04<00:00, 125530.48it/s]
  1%|▏         | 7087/518145 [00:00<00:07, 69536.79it/s]

.... remove_urls


100%|██████████| 518145/518145 [00:02<00:00, 193436.70it/s]
  1%|▏         | 7770/518145 [00:00<00:06, 77418.22it/s]

.... remove_numberwords


100%|██████████| 518145/518145 [00:03<00:00, 158507.49it/s]

Text cleanup finished in 15.41 seconds.






In [69]:
def tokenize(df, text_column):
    """Tokenize one text column into a new 'Unigrams' column.

    Every value of ``df[text_column]`` is passed to the module-level
    ``tknzr.tokenize`` and the result is stored in ``df['Unigrams']`` on the
    frame that was passed in.

    Args:
        df: DataFrame holding the texts.
        text_column: name of the column to tokenize.

    Returns:
        The same ``df`` object, with the 'Unigrams' column set.
    """
    print(f'Tokenizing Dataframe["{text_column}"].')
    began = time.time()
    texts = df[text_column]
    df['Unigrams'] = texts.progress_apply(tknzr.tokenize)
    elapsed = time.time() - began
    print(f'Dataframe["{text_column}"] tokenized in {elapsed:.2f} seconds.\n')
    return df


def remove_stopwords(input, stops):
    """Return the items of ``input`` that are not contained in ``stops``.

    Order and duplicates of the kept items are preserved.
    """
    kept = []
    for token in input:
        if token in stops:
            continue
        kept.append(token)
    return kept

def remove_extremewords(input, min, max):
    """Return the items of ``input`` whose length lies within ``[min, max]``.

    Both bounds are inclusive; order of the kept items is preserved.
    """
    kept = []
    for word in input:
        size = len(word)
        if size > max or size < min:
            continue
        kept.append(word)
    return kept


def clean_tokens(df):
    """Filter the 'Unigrams' tokens of ``df`` and derive a 'Bigrams' column.

    Steps, in order:
      1. Drop tokens whose length is outside the ``min_word_size`` /
         ``max_word_size`` options of the ``Text Cleaning`` config section
         (defaults 2 and 20 when an option is absent).
      2. Drop stop words: NLTK's English list, every punctuation character,
         and the comma-separated values of the ``Stop Words`` config section
         (all whitespace inside those values is removed before splitting).
      3. Build 'Bigrams' as "first second" strings from adjacent tokens.

    Both columns are written onto the frame that was passed in.

    Args:
        df: DataFrame with a 'Unigrams' column of token lists.

    Returns:
        The same ``df`` object with 'Unigrams' filtered and 'Bigrams' added.
    """
    tqdm.write('Cleaning up tokens...')
    start = time.time()
    tqdm.write('.... removing extreme words')
    cleaning_cfg = config['Text Cleaning']
    # Use `fallback` rather than `or`: `getint(...) or 2` would also replace an
    # explicitly configured 0 with the default.
    min_size = cleaning_cfg.getint('min_word_size', fallback=2)
    max_size = cleaning_cfg.getint('max_word_size', fallback=20)
    df['Unigrams'] = df['Unigrams'].progress_apply(
        lambda tokens: remove_extremewords(tokens, min_size, max_size))
    tqdm.write('.... removing stop words')
    stops = set(stopwords.words('english'))
    stops.update(punctuation)
    for _, value in config.items('Stop Words'):
        stops.update("".join(value.split()).split(','))
    df['Unigrams'] = df['Unigrams'].progress_apply(
        lambda tokens: remove_stopwords(input=tokens, stops=stops))
    tqdm.write('.... generating bigrams')
    df['Bigrams'] = df['Unigrams'].progress_apply(
        lambda tokens: [f'{first} {second}' for first, second in bigrams(tokens)])
    end = time.time()
    tqdm.write (f'Tokens cleanup finished in {end-start:.2f} seconds.\n')
    return df

In [70]:
# Tokenize the cleaned texts. tokenize() adds 'Unigrams' to the frame it
# receives and returns it, so `tokenized` is the same object as `cleaned`.
tokenized = tokenize(cleaned, text_column)

  0%|          | 943/518145 [00:00<00:54, 9421.70it/s]

Tokenizing Dataframe["Tweet"].


100%|██████████| 518145/518145 [00:46<00:00, 11238.57it/s]

Dataframe["Tweet"] tokenized in 46.18 seconds.






In [71]:
# Inspect the tokenized frame.
tokenized

Unnamed: 0_level_0,From,Tweet,Unigrams
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,@sai,top objectively biggest tech stories of,"[top, objectively, biggest, tech, stories, of]"
2010-01-01,@guardiantech,silicon valley campaign seeks startup visa for...,"[silicon, valley, campaign, seeks, startup, vi..."
2010-01-01,@techcrunch,: my fifth annual list of the tech products i ...,"[:, my, fifth, annual, list, of, the, tech, pr..."
2010-01-01,@techcrunch,namebench: google % project to find the fastes...,"[namebench, :, google, %, project, to, find, t..."
2010-01-01,@techcrunch,six new years resolutions for apple and the ip...,"[six, new, years, resolutions, for, apple, and..."
2010-01-01,@techcrunch,ten technologies that will rock by,"[ten, technologies, that, will, rock, by]"
2010-01-01,@techcrunch,"hotel wifi should be a right, not a luxury by","[hotel, wifi, should, be, a, right, ,, not, a,..."
2010-01-01,@techcrunch,twitter and me! why it the only social media ...,"[twitter, and, me, !, why, it, the, only, soci..."
2010-01-01,@engadget,ben heck ps3 slim laptop pops up for sale on e...,"[ben, heck, ps3, slim, laptop, pops, up, for, ..."
2010-01-01,@engadget,predictions for ?,"[predictions, for, ?]"


In [72]:
# Filter the tokens, add bigrams, and pickle the resulting frame to the output path.
clean_tokens(tokenized).to_pickle(output)

  0%|          | 0/518145 [00:00<?, ?it/s]

Cleaning up tokens...
.... removing extreme words


100%|██████████| 518145/518145 [00:03<00:00, 130251.20it/s]
  3%|▎         | 16832/518145 [00:00<00:02, 168316.95it/s]

.... removing stop words


100%|██████████| 518145/518145 [00:02<00:00, 190498.01it/s]
  4%|▍         | 23197/518145 [00:00<00:04, 109908.84it/s]

.... generating bigrams


100%|██████████| 518145/518145 [00:04<00:00, 109532.17it/s]


Tokens cleanup finished in 11.80 seconds.

