# 0. Importing libraries

In [1]:
import pandas as pd
import sys
from os import listdir,path
from nltk.stem import WordNetLemmatizer


In [2]:
#self defined functions
sys.path.insert(0, "../functions")
# from data_cleaning import remove_punctuations_urls, retrieve_hashtag
import data_cleaning as dc


# 1. Reading data

In [3]:
mlb = []
for file in listdir("../data"):
    if "twitter_MLB_202" in file:
        df = pd.read_csv(path.join("../data",file))
        mlb.append(df)

In [4]:
mlb[0].head()

Unnamed: 0,Datetime,Tweet Id,Text,Username,Reply Count,Retweet Count,Like Count,tweetquote Count,conversation ID,source,retweeted Tweet,quotedTweet,mentioned Users,language
0,2021-12-30 23:59:47+00:00,1476704601424465921,That's is if we're allowed #Baseball ever agai...,LittlePickles9,0,0,0,0,1476704601424465921,"<a href=""http://twitter.com/download/android"" ...",,https://twitter.com/LittlePickles9/status/1476...,,en
1,2021-12-30 23:54:30+00:00,1476703271221878793,Joining @RR_RicardoFAN next is @dcone36 to dis...,CBSSportsRadio,0,2,6,0,1476703271221878793,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,"[User(username='RR_RicardoFAN', id=23912945, d...",en
2,2021-12-30 23:34:26+00:00,1476698220176814081,When do you think the MLB Lockout is over?,NYYSportsfan96,0,0,0,0,1476698220176814081,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,en
3,2021-12-30 23:24:16+00:00,1476695662930501633,WHEN IS THE MLB LOCKOUT GOING TO BE OVER ‼️‼️‼...,itstitoo_,0,0,0,0,1476695662930501633,"<a href=""http://twitter.com/download/android"" ...",,,,en
4,2021-12-30 23:20:33+00:00,1476694729270071302,"MLB lockout mailbag: Major sticking points, po...",JenniferHeiner_,0,0,0,0,1476694729270071302,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",,,,en


# 2. Tidying the data

## 2.1 Limiting to only english tweets

In [5]:
eng_only = []
for df in mlb:
    eng_only.append(df[df.language == "en"])

## 2.2 Focusing on only text

In [6]:
texts = []
for month in eng_only:
    texts += [text for text in month.Text]

In [7]:
len(texts)

41243

In [8]:
eng_only[0].iloc[1].Text

'Joining @RR_RicardoFAN next is @dcone36 to discuss when the MLB lockout needs to end to get the season started on time, get his reaction to a few offseason signings &amp; more!\n\nListen Live: https://t.co/d1K0UWkr6D https://t.co/98wvhsGnZN'

## 2.3 tidy text

1. Transform all text to lower case
2. Remove urls and punctuation marks, including 
    - urls
    - extract hashtag marks # (stored apart for potential use, the hashtag itself will be removed later )
    - extract mention marks @ (stored apart for potential use, the hashtag itself will be removed later)
    - all remaining punctuation marks
3. Remove stop words
4. Lemmatize the words
4. Transform strings into (set) vectors of words

### 2.3.1 Text to lower case

In [9]:
lower_case_text = [text.lower() for text in texts]

### 2.3.2 Remove punctuations and url

**2.3.2.1 Remove URLs**

In [10]:
# test
print (lower_case_text[0])
print(dc.remove_urls(lower_case_text[0]))

that's is if we're allowed #baseball ever again...
#mlb #lockout https://t.co/wyucn4jej8
that's is if we're allowed #baseball ever again...
#mlb #lockout  


In [11]:
no_urls = [dc.remove_urls(text) for text in lower_case_text]
no_urls[1]

'joining @rr_ricardofan next is @dcone36 to discuss when the mlb lockout needs to end to get the season started on time, get his reaction to a few offseason signings &amp; more!\n\nlisten live:    '

**2.3.2.2 Extract hashtags and mentions**

In [12]:
hashtags = [dc.retrieve_hashtag(text) for text in no_urls]
mentions = [dc.retrieve_mentions(text) for text in no_urls]


In [13]:
print(hashtags[0])
print(mentions[1])

['#baseball', '#mlb', '#lockout']
['@rr_ricardofan', '@dcone36']


**2.3.2.3 Remove other punctuations**

In [14]:
#test
print(no_urls[0])
print(dc.remove_punctuations(no_urls[0]))

that's is if we're allowed #baseball ever again...
#mlb #lockout  
that is if we allowed baseball ever again mlb lockout


In [15]:
no_punctuations = [dc.remove_punctuations(text) for text in no_urls]


In [16]:
no_punctuations[:5]

['that is if we allowed baseball ever again mlb lockout',
 'joining rr_ricardofan next is dcone36 to discuss when the mlb lockout needs to end to get the season started on time get his reaction to a few offseason signings more listen live',
 'when do you think the mlb lockout is over',
 'when is the mlb lockout going to be over',
 'mlb lockout mailbag major sticking points potential spring training delays minor players and more']

### 2.3.3 Removing Stop Words

In [17]:
no_punctuations[0]

'that is if we allowed baseball ever again mlb lockout'

In [18]:
dc.remove_stopwords(no_punctuations[0])

'allowed baseball ever mlb lockout'

In [19]:
cleaned = [dc.remove_stopwords(text) for text in no_punctuations]

In [20]:
cleaned[:5]

['allowed baseball ever mlb lockout',
 'joining rr_ricardofan next dcone36 discuss mlb lockout needs end get season started time get reaction offseason signings listen live',
 'think mlb lockout',
 'mlb lockout going',
 'mlb lockout mailbag major sticking points potential spring training delays minor players']

### 2.3.4 Stemming and lemmatizing words

In [21]:
print(cleaned[0])
print(dc.lemmatize(cleaned[0]))


allowed baseball ever mlb lockout
allow baseball ever mlb lockout


In [22]:
lemmatized =  [dc.lemmatize(text) for text in cleaned]


In [23]:
lemmatized[:5]

['allow baseball ever mlb lockout',
 'join rr_ricardofan next dcone36 discuss mlb lockout need end get season start time get reaction offseason sign listen live',
 'think mlb lockout',
 'mlb lockout go',
 'mlb lockout mailbag major stick point potential spring train delay minor players']

### 2.3.5  Transform strings into (set) vectors of words

In [24]:
text_sets_list = [list(set(text.split())) for text in lemmatized]


In [25]:
text_sets_list[:5]

[['ever', 'baseball', 'lockout', 'allow', 'mlb'],
 ['need',
  'join',
  'discuss',
  'start',
  'time',
  'season',
  'reaction',
  'lockout',
  'offseason',
  'rr_ricardofan',
  'next',
  'dcone36',
  'mlb',
  'get',
  'sign',
  'end',
  'listen',
  'live'],
 ['lockout', 'think', 'mlb'],
 ['lockout', 'mlb', 'go'],
 ['point',
  'train',
  'stick',
  'delay',
  'mailbag',
  'potential',
  'lockout',
  'major',
  'spring',
  'players',
  'mlb',
  'minor']]

# 3 Store tidied data

In [26]:
tidied_text_added_df = pd.concat([eng_only[i] for i in range(4)])

In [27]:
len(tidied_text_added_df)

41243

In [28]:
tidied_text_added_df["hashtag"] = hashtags
tidied_text_added_df["mentioned"] = mentions
tidied_text_added_df["cleaned_text"] = cleaned
tidied_text_added_df["cleaned_text_list"] = text_sets_list


In [29]:
tidied_text_added_df[["Text","mentioned","hashtag","cleaned_text","cleaned_text_list"]].head()

Unnamed: 0,Text,mentioned,hashtag,cleaned_text,cleaned_text_list
0,That's is if we're allowed #Baseball ever agai...,[],"[#baseball, #mlb, #lockout]",allowed baseball ever mlb lockout,"[ever, baseball, lockout, allow, mlb]"
1,Joining @RR_RicardoFAN next is @dcone36 to dis...,"[@rr_ricardofan, @dcone36]",[],joining rr_ricardofan next dcone36 discuss mlb...,"[need, join, discuss, start, time, season, rea..."
2,When do you think the MLB Lockout is over?,[],[],think mlb lockout,"[lockout, think, mlb]"
3,WHEN IS THE MLB LOCKOUT GOING TO BE OVER ‼️‼️‼...,[],[],mlb lockout going,"[lockout, mlb, go]"
4,"MLB lockout mailbag: Major sticking points, po...",[],[],mlb lockout mailbag major sticking points pote...,"[point, train, stick, delay, mailbag, potentia..."


In [30]:
path = os.path.abspath("../data/tidied_text_added.csv")
tidied_text_added_df.to_csv(path)