In [1]:
import json
import re
from collections import Counter
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from time import time
import nltk
import string
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\feder\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\feder\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# Parse the tweet archive once. `data` is the working frame that the cells
# below extend and trim; `data_out` is an independent (deep) copy that stays
# untouched so the original columns can be exported next to the cleaned text.
data = pd.read_json("data.json")
data_out = data.copy()

In [3]:
# Number of rows in the loaded frame.
data.shape[0]

4224

In [4]:
# True as soon as a single cell of the frame holds a missing value.
data.isna().values.any()

False

In [5]:
# Print the per-column non-null counts and dtypes plus the memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4224 entries, 0 to 4223
Data columns (total 7 columns):
created_at        4224 non-null datetime64[ns]
favorite_count    4224 non-null int64
id_str            4224 non-null int64
is_retweet        4224 non-null bool
retweet_count     4224 non-null int64
source            4224 non-null object
text              4224 non-null object
dtypes: bool(1), datetime64[ns](1), int64(3), object(2)
memory usage: 202.2+ KB


In [6]:
%%time

# ekphrasis pipeline that turns a raw tweet into a list of normalized tokens.
text_processor = TextPreProcessor(
    # Tokenizer: a callable taking a string and returning a list of tokens.
    # SocialTokenizer is used here with the original casing preserved.
    tokenizer=SocialTokenizer(lowercase=False).tokenize,

    # Token classes that get normalized (e.g. mentions show up as <user>
    # and links as <url> in the processed text).
    normalize=['url', 'email', 'percent', 'money', 'phone','user',
        'time', 'date', 'number'],

    # Token classes that get annotated.
    annotate={"elongated", "repeated"},

    # Repair HTML tokens.
    fix_html=True,

    # Corpus whose word statistics drive word segmentation.
    segmenter="twitter",

    # Corpus whose word statistics drive spell correction.
    corrector="twitter",

    # Run word segmentation on hashtags.
    unpack_hashtags=True,

    # Expand contractions (can't -> can not).
    unpack_contractions=True,

    # Spell-correct elongated words.
    spell_correct_elong=True,

    # Replacement dictionaries applied to the extracted tokens; several
    # dictionaries may be supplied.
    dicts=[emoticons]
)

# Process every tweet and glue its tokens back together with single spaces.
normalized_text = [
    " ".join(text_processor.pre_process_doc(tweet))
    for tweet in data['text']
]
normalized_text_series = pd.Series(normalized_text)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...
Wall time: 12.4 s


In [7]:
# Store the normalized tweets as a plain list so they are assigned
# positionally, row by row.
data['normalized_text'] = list(normalized_text_series)

In [8]:
# Discard the tweet metadata columns; the cells that follow in this notebook
# only work on the text columns.
data = data.drop(
    columns=[
        "created_at",
        "favorite_count",
        "id_str",
        "is_retweet",
        "source",
        "retweet_count",
    ]
)

In [9]:
# Remove placeholder tags such as <user> or <url> from the normalized text.
# The class is spelled [A-Za-z] on purpose: a range written as A-z spans the
# ASCII block between 'Z' and 'a' as well, so it would also treat the
# characters [ \ ] ^ _ ` as tag letters.
tag_pattern = re.compile(r'<[A-Za-z]+>')
notag_tweets = [tag_pattern.sub('', t) for t in data.normalized_text]

data['text_no_tag'] = list(notag_tweets)

In [10]:
# Strip punctuation: every character that is neither a word character (\w)
# nor whitespace (\s) is deleted from the tag-free text.
nopun_tweets = [re.sub(r'[^\w\s]', '', tweet) for tweet in data.text_no_tag]

data['text_no_pun'] = list(nopun_tweets)

In [11]:
# Preview the first five rows with the intermediate text columns.
data.head(5)

Unnamed: 0,text,normalized_text,text_no_tag,text_no_pun
0,Russians are playing @CNN and @NBCNews for suc...,Russians are playing <user> and <user> for suc...,Russians are playing and for such fools - fu...,Russians are playing and for such fools fun...
1,"Join @AmerIcan32, founded by Hall of Fame lege...","Join <user> , founded by Hall of Fame legend <...","Join , founded by Hall of Fame legend on in...",Join founded by Hall of Fame legend on in ...
2,Great move on delay (by V. Putin) - I always k...,Great move on delay ( by V . Putin ) - I alway...,Great move on delay ( by V . Putin ) - I alway...,Great move on delay by V Putin I always kn...
3,My Administration will follow two simple rules...,My Administration will follow two simple rules...,My Administration will follow two simple rules :,My Administration will follow two simple rules
4,'Economists say Trump delivered hope' https://...,' Economists say Trump delivered hope ' <url>,' Economists say Trump delivered hope ',Economists say Trump delivered hope


In [12]:
# Character count of each tweet after tag removal (measured on `text_no_tag`,
# i.e. punctuation is still included).
data['length'] = data['text_no_tag'].map(len)
data.head(5)

Unnamed: 0,text,normalized_text,text_no_tag,text_no_pun,length
0,Russians are playing @CNN and @NBCNews for suc...,Russians are playing <user> and <user> for suc...,Russians are playing and for such fools - fu...,Russians are playing and for such fools fun...,105
1,"Join @AmerIcan32, founded by Hall of Fame lege...","Join <user> , founded by Hall of Fame legend <...","Join , founded by Hall of Fame legend on in...",Join founded by Hall of Fame legend on in ...,71
2,Great move on delay (by V. Putin) - I always k...,Great move on delay ( by V . Putin ) - I alway...,Great move on delay ( by V . Putin ) - I alway...,Great move on delay by V Putin I always kn...,72
3,My Administration will follow two simple rules...,My Administration will follow two simple rules...,My Administration will follow two simple rules :,My Administration will follow two simple rules,49
4,'Economists say Trump delivered hope' https://...,' Economists say Trump delivered hope ' <url>,' Economists say Trump delivered hope ',Economists say Trump delivered hope,40


In [13]:
# Lower-case the punctuation-free text and split it on whitespace, so each
# row of `text_lower` holds a list of tokens.
data["text_lower"] = (
    data["text_no_pun"]
    .str.lower()
    .str.split()
)

In [14]:
x_data = data["text_lower"]

# English stop words. The list is loaded once; the frozenset copy gives O(1)
# membership checks and avoids reloading the corpus for every single word.
sw = stopwords.words('english')
_STOPWORD_SET = frozenset(sw)
# words.add('putin')  # extend `words` to whitelist extra vocabulary

l = nltk.stem.WordNetLemmatizer()


def remove_stop_lemmatiz(tweet):
    """Drop English stop words from a tokenized tweet and lemmatize the rest.

    Parameters
    ----------
    tweet : iterable of str
        Tokens of one tweet (the rows of `text_lower` are lower-cased and
        whitespace-split).

    Returns
    -------
    list of str
        The WordNet lemma of every token that is not an English stop word,
        in the original order.
    """
    part = [word for word in tweet if word not in _STOPWORD_SET]
    part = [l.lemmatize(word) for word in part]
    # Optional extra filter: uncomment to also drop tokens that are not in the
    # NLTK English word list (`words`).
    #part= [word for word in part if word in words]
    return part

def remove_custom_words(tweet,wordlist):
    """Keep only the tokens of `tweet` that appear in `wordlist`.

    Despite the name, `wordlist` acts as an allow-list: tokens that are not
    contained in it are the ones removed. Order and duplicates of the kept
    tokens are preserved.
    """
    kept = []
    for token in tweet:
        if token in wordlist:
            kept.append(token)
    return kept



# Vocabulary frequency band: a word is kept only if it occurs at least
# MIN_WORD_COUNT and at most MAX_WORD_COUNT times over the whole corpus
# (drops one-off words and the overly frequent ones).
MIN_WORD_COUNT = 2
MAX_WORD_COUNT = 150

x_data_cleaned = [remove_stop_lemmatiz(tweet) for tweet in x_data]

# Every cleaned tweet, space-separated, each followed by one space. A single
# join replaces the repeated string concatenation, which is quadratic.
superstring = ''.join(' '.join(tweet) + ' ' for tweet in x_data_cleaned)

# (word, count) pairs, most frequent first; counted once and reused below.
words_list = Counter(superstring.split()).most_common()

# `words_list` is already ordered by descending count, so filtering it keeps
# that order without sorting an unordered set.
sorted_by_second = [
    pair for pair in words_list
    if MIN_WORD_COUNT <= pair[1] <= MAX_WORD_COUNT
]
words_to_use = set(sorted_by_second)
final_word_allowed_set = {pair[0] for pair in sorted_by_second}

# Restrict every tweet to the allowed vocabulary.
x_data_cleaned_2 = [
    remove_custom_words(tweet, final_word_allowed_set)
    for tweet in x_data_cleaned
]


In [15]:
# Re-assemble each filtered token list into one space-separated string.
new = list(map(' '.join, x_data_cleaned_2))

# Single-column frame holding the preprocessed text.
df_in = pd.DataFrame(new)
df_in.columns = ['text_pp']

# Put the preprocessed text next to the untouched original columns and
# write the combined table to disk.
b = pd.concat([data_out, df_in], axis=1)
b.to_csv('preprocess.csv')