In [1]:
# coding: utf-8

import pandas as pd
import numpy as np
import datetime
import re
import unicodedata
import time
from tqdm import tqdm
import nltk
from nltk import word_tokenize, pos_tag
from nltk import bigrams
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import TweetTokenizer
from string import punctuation
from nltk.corpus import stopwords
from configparser import ConfigParser, ExtendedInterpolation

In [2]:
def setup():
    """Prepare the runtime: hook tqdm into pandas and check the NLTK data.

    Registers tqdm's pandas integration, then looks up each NLTK resource
    listed below with ``nltk.data.find`` and downloads the ones that raise
    ``LookupError``. Progress and the elapsed wall-clock time are printed.
    """
    start = time.time()
    print ('Running setup...')
    tqdm.pandas()
    # Paths in the form expected by nltk.data.find; the component after the
    # slash is the package id handed to nltk.download.
    required = (
        'taggers/averaged_perceptron_tagger',
        'corpora/wordnet',
        'corpora/stopwords',
        'tokenizers/punkt',
    )
    for resource_path in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            package_id = resource_path.split('/')[1]
            nltk.download(package_id)
    end = time.time()
    print (f'Setup finished in {end-start:.2f} seconds.\n')

In [3]:
# Run the one-time environment setup (tqdm/pandas hook and NLTK data check).
setup()

Running setup...
Setup finished in 0.00 seconds.



In [6]:
# Parse the pipeline configuration. '#' and ';' are accepted as inline comment
# markers, and ExtendedInterpolation enables ${section:option} references.
# NOTE: ConfigParser.read skips files it cannot open instead of raising; the
# list it returns (shown as this cell's output) names the files actually parsed.
config = ConfigParser(
    inline_comment_prefixes="#;",
    interpolation=ExtendedInterpolation())
config.read('../config.ini')

['../config.ini']

In [7]:
# Pull the input path, the output path and the text column name from the config.
# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the notebook session.
input = config['General']['input_file']
output = config['Text Cleaning']['tokenized_file']
text_column = config['General']['input_file_text_column']

In [9]:
# Display the values just read from the config as a quick sanity check.
(input, output, text_column)

('./data/twitter.csv', './data/tokenized.data', 'Tweet')

In [10]:
# Override the configured paths with ones that go up one directory.
# NOTE(review): presumably needed because the config values start with './'
# while the config file itself is read from '../' - confirm, and consider
# deriving these from the config instead of hard-coding them.
input = '../data/twitter.csv'
output = '../data/tokenized.data'

In [11]:
# Shared TweetTokenizer: lowercases tokens (preserve_case=False), drops
# @handles (strip_handles=True) and shortens repeated-character runs (reduce_len=True).
tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
# WordNet lemmatizer instance; no cell visible in this section references it.
lmtzr = WordNetLemmatizer()

In [13]:
def days_to_date(srl_no):
    """Turn a 1-based day serial number into a ``datetime.datetime``.

    The reference date comes from ``config['General']['first_date']``, a
    comma-separated list whose first three integers are used as year, month
    and day. Serial number 1 maps to the reference date itself.
    """
    raw_parts = config['General']['first_date'].split(',')
    parts = [int(piece.strip()) for piece in raw_parts]
    reference = datetime.datetime(parts[0], parts[1], parts[2])
    # Serial numbers start at 1, so the offset in days is one less.
    offset = datetime.timedelta(int(srl_no - 1))
    return reference + offset

In [14]:
def read_data(filename):
    """Load the tweets CSV, drop excluded sources and index the rows by date.

    The file is read as a ';'-separated, latin-1 encoded CSV whose header row
    is replaced by the names StringDate, Days, From and Tweet; StringDate is
    then discarded. When ``convert_date`` is enabled in the [General] config
    section, the Days values are converted with ``days_to_date``. The column
    is renamed to Date in either case.

    Rows whose From value matches an entry of the comma-separated
    ``exclude_sources`` option are removed. The comparison is
    case-insensitive: the configured names are lowercased, so the From values
    are lowercased as well before matching (previously mixed-case handles
    could never be excluded).

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A DataFrame indexed and sorted by Date with the columns index (the
        row position before the date index was set), From and Tweet.
    """
    print('Reading data...')
    start = time.time()
    tweets = pd.read_csv(filename, encoding='latin-1', sep=';', header=0,
                         names=['StringDate', 'Days', 'From', 'Tweet'])
    tweets = tweets.filter(items=['Days', 'From', 'Tweet'])
    if config.getboolean('General', 'convert_date'):
        tweets['Days'] = tweets['Days'].progress_apply(days_to_date)
    tweets.columns = ['Date', 'From', 'Tweet']
    excluded = {source.strip().lower()
                for source in config['General']['exclude_sources'].split(",")}
    # Lowercase the column too so it is compared on the same footing as the
    # lowercased config entries; astype(str) keeps .str usable for any dtype.
    sources = tweets['From'].astype(str).str.lower()
    tweets = tweets[~sources.isin(excluded)]
    # reset_index() without drop=True keeps the pre-filter row position as an
    # 'index' column, which is part of the frame that later gets pickled.
    tweets = tweets.reset_index()
    tweets = tweets.set_index('Date').sort_index()
    end = time.time()
    print (f'Data read in {end-start:.2f} seconds.\n')
    return tweets

In [15]:
# Load the tweets from the input CSV (source filtering and date indexing included).
input_data = read_data(input)

Reading data...


100%|██████████| 645981/645981 [00:10<00:00, 59497.24it/s]


Data read in 12.63 seconds.



In [17]:
def remove_accents(text):
    """Return ``text`` as a lowercase ASCII string without diacritics.

    The value is converted with ``str()``, decomposed (NFD) so accents become
    separate combining marks, and every non-ASCII character is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', str(text))
    ascii_only = decomposed.encode('ascii', 'ignore')
    return str(ascii_only.decode("utf-8").lower())

def remove_apostrophes(text):
    """Delete every occurrence of an apostrophe followed by ``s`` from ``text``."""
    possessive = re.compile(r"\'s")
    return possessive.sub("", text)

def remove_hashtags(text):
    """Strip #hashtags and @handles from ``text``.

    A tag is '#' or '@' followed by ASCII letters, digits or underscores. The
    leading ``\\B`` requires that the marker is not glued to a preceding word
    character, so e-mail-like strings such as ``me@example.com`` are kept.

    Underscores are part of the tag: previously ``#foo_bar`` left ``_bar``
    behind and ``@tech_crunch`` was not removed at all.
    """
    # hashtags and handles
    text = re.sub(r'\B(\#[0-9a-zA-Z_]+|\@[0-9a-zA-Z_]+\b)', '', text)
    return text
def remove_urls(text):
    """Delete every run of non-whitespace characters that starts with ``http``."""
    url_like = re.compile(r'http\S+')
    return url_like.sub('', text)

def remove_numberwords(text):
    """Delete stand-alone digit sequences together with trailing whitespace.

    Digits embedded in a word (for example ``ps3``) are left untouched because
    the pattern requires a word boundary on both sides.
    """
    standalone_number = re.compile(r'\b[0-9]+\b\s*')
    return standalone_number.sub('', text)


def clean_text(df, text_column):
    """Run the configured text-cleaning actions over one column of ``df``.

    The action names come from the comma-separated ``actions`` option of the
    [Text Cleaning] config section. Each name is stripped, lowercased and
    looked up among this module's globals; the resulting callables are applied
    to ``df[text_column]`` in the listed order.

    Blank entries (for example from a trailing comma) are ignored. All names
    are resolved before any action runs, so a misspelled name fails fast
    instead of leaving the column partially cleaned.

    Args:
        df: DataFrame to clean. It is modified in place.
        text_column: Name of the column holding the text.

    Returns:
        The same ``df`` object, with ``text_column`` replaced.

    Raises:
        KeyError: If a configured action is not a callable module-level name.
    """
    tqdm.write(f'Cleaning up {text_column} texts...')
    start = time.time()
    configured = config['Text Cleaning']['actions'].split(",")
    actions = [name for name in (entry.strip().lower() for entry in configured) if name]
    # Resolve everything first: the column is overwritten step by step, so a
    # lookup failure midway would leave it in a half-processed state.
    resolved = []
    for action in actions:
        cleaner = globals().get(action)
        if not callable(cleaner):
            raise KeyError(f'Unknown text cleaning action: {action!r}')
        resolved.append((action, cleaner))
    for action, cleaner in resolved:
        tqdm.write('.... ' + action)
        df[text_column] = df[text_column].progress_apply(cleaner)
    end = time.time()
    tqdm.write(f'Text cleanup finished in {end-start:.2f} seconds.\n')
    return df

In [18]:
# Apply the configured cleaning actions to the text column.
# NOTE: clean_text assigns into the frame it receives and returns it, so
# `cleaned` and `input_data` are the same object after this cell.
cleaned = clean_text(input_data, text_column)

  9%|▊         | 44190/518145 [00:00<00:02, 174830.93it/s]

Cleaning up Tweet texts...
.... remove_accents


100%|██████████| 518145/518145 [00:01<00:00, 287314.86it/s]
 10%|█         | 54387/518145 [00:00<00:01, 258068.34it/s]

.... remove_apostrophes


100%|██████████| 518145/518145 [00:01<00:00, 313792.25it/s]
  2%|▏         | 8653/518145 [00:00<00:05, 86527.19it/s]

.... remove_hashtags


100%|██████████| 518145/518145 [00:03<00:00, 134350.21it/s]
  9%|▉         | 48836/518145 [00:00<00:02, 233293.48it/s]

.... remove_urls


100%|██████████| 518145/518145 [00:02<00:00, 248614.07it/s]
  3%|▎         | 15140/518145 [00:00<00:03, 151396.90it/s]

.... remove_numberwords


100%|██████████| 518145/518145 [00:02<00:00, 172839.34it/s]

Text cleanup finished in 12.68 seconds.






In [20]:
def tokenize(df, text_column):
    """Tokenize ``df[text_column]`` into a new ``Unigrams`` column.

    Every value of the column is passed to the shared ``tknzr`` tokenizer. The
    frame is modified in place and returned; timing is printed.
    """
    print(f'Tokenizing Dataframe["{text_column}"].')
    start = time.time()
    texts = df[text_column]
    df['Unigrams'] = texts.progress_apply(tknzr.tokenize)
    end = time.time()
    print(f'Dataframe["{text_column}"] tokenized in {end-start:.2f} seconds.\n')
    return df


def remove_stopwords(input, stops):
    """Return the items of ``input``, in order, that are not members of ``stops``."""
    kept = []
    for token in input:
        if token in stops:
            continue
        kept.append(token)
    return kept

def remove_extremewords(input, min, max):
    """Return the items of ``input`` whose length lies within ``[min, max]``.

    Both bounds are inclusive and the original order is preserved.
    """
    kept = []
    for token in input:
        if min <= len(token) <= max:
            kept.append(token)
    return kept


def clean_tokens(df):
    """Filter the ``Unigrams`` column and derive a ``Bigrams`` column from it.

    Steps, in order:
      1. Keep only tokens whose length is between ``min_word_size`` and
         ``max_word_size`` (options of the [Text Cleaning] config section;
         defaults 2 and 20 when an option is absent).
      2. Drop stop words: NLTK's English list, every punctuation character and
         the comma-separated values of every option in the [Stop Words]
         section (all whitespace inside those values is removed first).
      3. Join each pair of adjacent remaining tokens with a space to build the
         ``Bigrams`` column.

    The defaults are supplied through ``fallback=`` so that an explicitly
    configured ``0`` is honoured rather than treated as "not set".

    Args:
        df: DataFrame with a ``Unigrams`` column of token lists. It is
            modified in place.

    Returns:
        The same ``df`` object.
    """
    tqdm.write('Cleaning up tokens...')
    start = time.time()
    tqdm.write('.... removing extreme words')
    section = config['Text Cleaning']
    min_size = section.getint('min_word_size', fallback=2)
    max_size = section.getint('max_word_size', fallback=20)
    df['Unigrams'] = df['Unigrams'].progress_apply(
        lambda tokens: remove_extremewords(tokens, min_size, max_size))
    tqdm.write('.... removing stop words')
    stops = set(stopwords.words('english'))
    stops.update(punctuation)  # adds each punctuation character individually
    for _, value in config.items('Stop Words'):
        # Strip all whitespace before splitting so "a, b" yields {'a', 'b'}.
        stops.update("".join(value.split()).split(','))
    df['Unigrams'] = df['Unigrams'].progress_apply(
        lambda tokens: remove_stopwords(input=tokens, stops=stops))
    tqdm.write('.... generating bigrams')
    df['Bigrams'] = df['Unigrams'].progress_apply(
        lambda tokens: [f'{first} {second}' for first, second in bigrams(tokens)])
    end = time.time()
    tqdm.write (f'Tokens cleanup finished in {end-start:.2f} seconds.\n')
    return df

In [22]:
# Tokenize the cleaned text column into a 'Unigrams' column.
# NOTE: tokenize adds the column to the frame it receives and returns it, so
# `tokenized` refers to the same object as `cleaned`.
tokenized = tokenize(cleaned, text_column)

  0%|          | 702/518145 [00:00<01:13, 7018.25it/s]

Tokenizing Dataframe["Tweet"].


100%|██████████| 518145/518145 [00:46<00:00, 11200.62it/s]


Dataframe["Tweet"] tokenized in 46.42 seconds.



In [23]:
# Inspect the tokenized frame.
# NOTE(review): the recorded output shows a 'Bigrams' column, which is only
# created by clean_tokens in the following cell - the saved output appears to
# come from a run after that cell had already modified this frame in place.
tokenized

Unnamed: 0_level_0,index,From,Tweet,Unigrams,Bigrams
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-01,65285,@SAI,top objectively biggest tech stories of,"[top, objectively, biggest, tech, stories, of]","[objectively tech, tech stories]"
2010-01-01,195166,@guardiantech,silicon valley campaign seeks startup visa for...,"[silicon, valley, campaign, seeks, startup, vi...","[silicon campaign, campaign seeks, seeks start..."
2010-01-01,502284,@TechCrunch,: my fifth annual list of the tech products i ...,"[:, my, fifth, annual, list, of, the, tech, pr...","[fifth list, list tech, tech products, product..."
2010-01-01,502285,@TechCrunch,namebench: google % project to find the fastes...,"[namebench, :, google, %, project, to, find, t...","[namebench project, project find, find fastest..."
2010-01-01,502286,@TechCrunch,six new years resolutions for apple and the ip...,"[six, new, years, resolutions, for, apple, and...",[six resolutions]
2010-01-01,502287,@TechCrunch,ten technologies that will rock by,"[ten, technologies, that, will, rock, by]","[ten technologies, technologies rock]"
2010-01-01,502288,@TechCrunch,"hotel wifi should be a right, not a luxury by","[hotel, wifi, should, be, a, right, ,, not, a,...","[hotel wifi, wifi right, right luxury]"
2010-01-01,502289,@TechCrunch,twitter and me! why it the only social media ...,"[twitter, and, me, !, why, it, the, only, soci...","[social media, media tool, tool use]"
2010-01-01,342891,@engadget,ben heck ps3 slim laptop pops up for sale on e...,"[ben, heck, ps3, slim, laptop, pops, up, for, ...","[ben heck, heck ps3, ps3 slim, slim laptop, la..."
2010-01-01,342892,@engadget,predictions for ?,"[predictions, for, ?]",[]


In [24]:
# Filter the tokens (length limits, stop words), add bigrams and pickle the
# resulting frame to the `output` path.
clean_tokens(tokenized).to_pickle(output)

  2%|▏         | 9023/518145 [00:00<00:05, 88132.42it/s]

Cleaning up tokens...
.... removing extreme words


100%|██████████| 518145/518145 [00:03<00:00, 146924.72it/s]
  8%|▊         | 43987/518145 [00:00<00:02, 204880.37it/s]

.... removing stop words


100%|██████████| 518145/518145 [00:02<00:00, 180325.95it/s]
  5%|▍         | 25065/518145 [00:00<00:04, 118792.03it/s]

.... generating bigrams


100%|██████████| 518145/518145 [00:05<00:00, 89224.03it/s] 


Tokens cleanup finished in 12.78 seconds.

