# Data Cleaning 2

### Import Dependencies

In [1]:
import pandas as pd
import re
import string

import pickle # just in case

### Set file locations

In [2]:
# Raw data file for tweet text (input file)
tweet_text_file = '../00_data/twitter_data/twitter_text.csv'

# File for cleaned tweet text (input file)
clean_csv = '../00_data/twitter_data/twitter_data_cleaned.csv'

# File for cleaned tweet text (output file)
cleantext_csv = '../00_data/twitter_data/twitter_cleantext.csv'

### Read in the data

In [3]:
df = pd.read_csv(clean_csv)

In [4]:
df.head()

Unnamed: 0,tweet_id,created_at,full_text,geo,coordinates,place,retweet_count,favorite_count,possibly_sensitive,lang,user_id,created_at_datetime
0,1.364223e+18,Tue Feb 23 14:38:16 +0000 2021,Here’s what's in the COVID relief package:\n \...,,,,9160,38093,NOT FOUND,en,29501250.0,2021-02-23 14:38:16+00:00
1,1.364381e+18,Wed Feb 24 01:07:52 +0000 2021,Will the National Endowment for the Arts be he...,,,,6131,18560,NOT FOUND,en,1.201671e+18,2021-02-24 01:07:52+00:00
2,1.36461e+18,Wed Feb 24 16:14:15 +0000 2021,"This is both anecdotal and early, but many lon...",,,,5941,63174,NOT FOUND,en,38428720.0,2021-02-24 16:14:15+00:00
3,1.364727e+18,Wed Feb 24 23:59:58 +0000 2021,A Link to Professor Chossudovsky’s Analysis of...,,,,1,0,False,en,2192010000.0,2021-02-24 23:59:58+00:00
4,1.364727e+18,Wed Feb 24 23:59:58 +0000 2021,Children warned over hugging grandparents even...,,,,0,2,False,en,2868190000.0,2021-02-24 23:59:58+00:00


In [5]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51157 entries, 0 to 51156
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   tweet_id             51157 non-null  float64
 1   created_at           51157 non-null  object 
 2   full_text            51157 non-null  object 
 3   geo                  77 non-null     object 
 4   coordinates          77 non-null     object 
 5   place                1005 non-null   object 
 6   retweet_count        51157 non-null  int64  
 7   favorite_count       51157 non-null  int64  
 8   possibly_sensitive   51157 non-null  object 
 9   lang                 51157 non-null  object 
 10  user_id              51157 non-null  float64
 11  created_at_datetime  51157 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 4.7+ MB


### Preparing text for analysis

The general goal here is to get a bag of words for each tweet. 

In [6]:
# Create a new dataframe.
text_df = df[['tweet_id', 'full_text']].copy()
text_df.set_index('tweet_id')

Unnamed: 0_level_0,full_text
tweet_id,Unnamed: 1_level_1
1.364223e+18,Here’s what's in the COVID relief package:\n \...
1.364381e+18,Will the National Endowment for the Arts be he...
1.364610e+18,"This is both anecdotal and early, but many lon..."
1.364727e+18,A Link to Professor Chossudovsky’s Analysis of...
1.364727e+18,Children warned over hugging grandparents even...
...,...
1.366553e+18,EU - GERMANY ....and the beat goes on.... Wher...
1.366549e+18,@satyendrajain @ArvindKejriwal @DrKKAggarwal ...
1.366547e+18,Do it the Coward and Liar way. Get vaccinated ...
1.366544e+18,Trump urges supporters to get vaccinated again...


In [7]:
# Define a little cleaner function
# I would really like to get some review on the regex here.

def clean_text_round1(text):
    '''Remove hashtags, remove urls, remove twitter handles, make text lowercase, remove punctuation, excess whitespace (in that order).'''
    # remove hashtags (possible issue if hashtags are used as words, e.g. 'We should get #VACCINCATED')
    text = re.sub('#(\w+)', '', text)
    # remove urls of the form general form https://t.co/yXdx5kVNYV
    text = re.sub('https:\/\/t\.co\/([a-zA-Z0-9]+)', '', text)
    # remove twitter handles
    text = re.sub('(RT )?@[A-Za-z0-9_]+', '', text)
    # make text lowercase
    text = text.lower()
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # remove multiple whitespace, and convert all whitespace to space (' ').
    text = " ".join(text.split())
    
    return text

In [8]:
text_df['clean_text'] = text_df['full_text'].apply(lambda x: clean_text_round1(x))

In [9]:
text_df

Unnamed: 0,tweet_id,full_text,clean_text
0,1.364223e+18,Here’s what's in the COVID relief package:\n \...,here’s whats in the covid relief package direc...
1,1.364381e+18,Will the National Endowment for the Arts be he...,will the national endowment for the arts be he...
2,1.364610e+18,"This is both anecdotal and early, but many lon...",this is both anecdotal and early but many long...
3,1.364727e+18,A Link to Professor Chossudovsky’s Analysis of...,a link to professor chossudovsky’s analysis of...
4,1.364727e+18,Children warned over hugging grandparents even...,children warned over hugging grandparents even...
...,...,...,...
51152,1.366553e+18,EU - GERMANY ....and the beat goes on.... Wher...,eu germany and the beat goes on where are the ...
51153,1.366549e+18,@satyendrajain @ArvindKejriwal @DrKKAggarwal ...,vaccination of frontline officers is being del...
51154,1.366547e+18,Do it the Coward and Liar way. Get vaccinated ...,do it the coward and liar way get vaccinated a...
51155,1.366544e+18,Trump urges supporters to get vaccinated again...,trump urges supporters to get vaccinated again...


In [10]:
text_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51157 entries, 0 to 51156
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   tweet_id    51157 non-null  float64
 1   full_text   51157 non-null  object 
 2   clean_text  51157 non-null  object 
dtypes: float64(1), object(2)
memory usage: 1.2+ MB


In [11]:
text_df.to_csv(cleantext_csv, index = False)