# Load and explore

In [7]:
import pandas as pd
# The sample is tab-separated even though the file carries a .csv extension.
tweets_en_df = pd.read_csv(
    "tweets_sample_en.csv",
    delimiter="\t",
)

In [8]:
# Preview the first rows to confirm the tab-separated file parsed into separate columns.
tweets_en_df.head()

Unnamed: 0.1,Unnamed: 0,created_at,text,id
0,0,Tue Jun 16 16:30:49 +0000 2020,20 After life üòÅ #IndiaChinaFaceOff,1272929635551109120
1,1,Tue Jun 16 16:30:49 +0000 2020,RT @lewis_goodall: Govt has partly defended it...,1272929635538481154
2,2,Tue Jun 16 16:30:49 +0000 2020,"RT @teachaged: So, my sister and I are 59 toda...",1272929635546742785
3,3,Tue Jun 16 16:30:49 +0000 2020,RT @lovelyluupin: this video enrages me i hate...,1272929635542593536
4,4,Tue Jun 16 16:30:49 +0000 2020,RT @relic_crusher: a couple of cool down weddi...,1272929635550900225


In [9]:
# Column dtypes and non-null counts as loaded, before any type conversion.
tweets_en_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  150001 non-null  int64 
 1   created_at  150001 non-null  object
 2   text        150001 non-null  object
 3   id          150001 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 4.6+ MB


In [10]:
# Twitter's `created_at` strings look like "Tue Jun 16 16:30:49 +0000 2020".
# Spelling out the format avoids per-element format guessing (slow on ~150k rows
# and a source of "could not infer format" warnings on recent pandas), and
# utc=True pins the result to datetime64[ns, UTC] whatever offset a row carries.
tweets_en_df['created_at'] = pd.to_datetime(
    tweets_en_df['created_at'],
    format='%a %b %d %H:%M:%S %z %Y',
    utc=True,
)

In [11]:
# Summary statistics of the frame.
# NOTE(review): the numeric columns here are a row counter and the tweet id, so
# these statistics are mostly a sanity check on the row count and id range.
tweets_en_df.describe()

Unnamed: 0.1,Unnamed: 0,id
count,150001.0,150001.0
mean,75000.0,1.272949e+18
std,43301.703202,8897664000000.0
min,0.0,1.27293e+18
25%,37500.0,1.272942e+18
50%,75000.0,1.272949e+18
75%,112500.0,1.272956e+18
max,150000.0,1.272965e+18


In [12]:
# Re-inspect the dtypes: `created_at` should now be a timezone-aware datetime
# column instead of `object`.
tweets_en_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype              
---  ------      --------------   -----              
 0   Unnamed: 0  150001 non-null  int64              
 1   created_at  150001 non-null  datetime64[ns, UTC]
 2   text        150001 non-null  object             
 3   id          150001 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(2), object(1)
memory usage: 4.6+ MB


Let's take only the text column and start cleaning it.

In [24]:
# Work on the tweet bodies only; every cleaning step below rebinds `tweets_text`
# to a new Series rather than modifying the frame.
tweets_text = tweets_en_df.loc[:, 'text']

In [25]:
# Raw tweet text before any cleaning, for comparison with the steps below.
tweets_text.head()

0                   20 After life üòÅ #IndiaChinaFaceOff
1    RT @lewis_goodall: Govt has partly defended it...
2    RT @teachaged: So, my sister and I are 59 toda...
3    RT @lovelyluupin: this video enrages me i hate...
4    RT @relic_crusher: a couple of cool down weddi...
Name: text, dtype: object

# Cleaning the text

In [26]:
import re
from nltk.corpus import stopwords

In [27]:
# Remove the retweet prefix "RT @<nickname>:" and leave a single space in its place.
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace` defaults
# to regex=False, and a compiled pattern is only accepted when regex is enabled.
regex_pat = re.compile(r'RT @\w+:')
tweets_text = tweets_text.str.replace(regex_pat, ' ', regex=True)
tweets_text.head()

0                   20 After life üòÅ #IndiaChinaFaceOff
1      Govt has partly defended its schools policy ...
2      So, my sister and I are 59 today (yep we‚Äôre ...
3      this video enrages me i hate it\n\n https://...
4      a couple of cool down wedding dress doodles ...
Name: text, dtype: object

In [28]:
# Remove every whitespace-delimited token that contains an "@": this covers
# e-mail addresses and also any remaining @mentions. One optional trailing
# whitespace character is consumed and the whole match becomes a single space.
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace` defaults
# to regex=False, and a compiled pattern is only accepted when regex is enabled.
regex_pat = re.compile(r'\S*@\S*\s?')
tweets_text = tweets_text.str.replace(regex_pat, ' ', regex=True)

In [29]:
# Remove http/https URLs. The match stops at the first whitespace character so
# that only the URL itself is dropped; a greedy `.*` here would also delete any
# tweet text that follows the URL on the same line.
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace` defaults
# to regex=False, and a compiled pattern is only accepted when regex is enabled.
regex_pat = re.compile(r'https?://\S+')
tweets_text = tweets_text.str.replace(regex_pat, ' ', regex=True)
tweets_text.head(10)

0                   20 After life üòÅ #IndiaChinaFaceOff
1      Govt has partly defended its schools policy ...
2      So, my sister and I are 59 today (yep we‚Äôre ...
3                this video enrages me i hate it\n\n  
4      a couple of cool down wedding dress doodles ...
5      Jesus!  says he was only aware of the free s...
6     #misinformation from the worst #CNNisFakeNews...
7      Germaphobia is a MENTAL ILLNESS. We urgently...
8      Rt‚Äôing this because I feel like black people...
9      I deserved Every L I ever took ... but I lea...
Name: text, dtype: object

In [31]:
# convert to lower case
# Done before the letter filter and the stopword lookup so that both operate on
# lowercase text only.
tweets_text = tweets_text.str.lower()
tweets_text.head(10)

0                   20 after life üòÅ #indiachinafaceoff
1      govt has partly defended its schools policy ...
2      so, my sister and i are 59 today (yep we‚Äôre ...
3                this video enrages me i hate it\n\n  
4      a couple of cool down wedding dress doodles ...
5      jesus!  says he was only aware of the free s...
6     #misinformation from the worst #cnnisfakenews...
7      germaphobia is a mental illness. we urgently...
8      rt‚Äôing this because i feel like black people...
9      i deserved every l i ever took ... but i lea...
Name: text, dtype: object

# Keep only letters, tokenize and remove stopwords

In [36]:
# Keep only ASCII letters and whitespace. Everything else (digits, punctuation,
# emoji, ...) is deleted rather than replaced by a space, so the two halves of a
# contraction are joined into one token.
# No IGNORECASE flag: the class already lists both cases, and with that flag a
# Unicode pattern's [a-zA-Z] additionally matches 'ı' (U+0131) and 'ſ' (U+017F),
# which would then slip through this filter.
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace` defaults
# to regex=False, and a compiled pattern is only accepted when regex is enabled.
regex_pat = re.compile(r'[^a-zA-Z\s]')
tweets_text = tweets_text.str.replace(regex_pat, '', regex=True)
tweets_text.head(10)

0                        after life  indiachinafaceoff
1      govt has partly defended its schools policy ...
2      so my sister and i are  today yep were twins...
3                this video enrages me i hate it\n\n  
4      a couple of cool down wedding dress doodles ...
5      jesus  says he was only aware of the free sc...
6     misinformation from the worst cnnisfakenews w...
7      germaphobia is a mental illness we urgently ...
8      rting this because i feel like black people ...
9      i deserved every l i ever took  but i learne...
Name: text, dtype: object

In [37]:
# Split each cleaned tweet into a list of tokens on runs of whitespace.
# The vectorised `.str.split()` (no pattern) splits exactly like Python's
# `str.split()` and, unlike a per-row lambda, passes missing values through
# instead of raising.
tweets_text = tweets_text.str.split()
tweets_text.head(10)

0                     [after, life, indiachinafaceoff]
1    [govt, has, partly, defended, its, schools, po...
2    [so, my, sister, and, i, are, today, yep, were...
3              [this, video, enrages, me, i, hate, it]
4    [a, couple, of, cool, down, wedding, dress, do...
5    [jesus, says, he, was, only, aware, of, the, f...
6    [misinformation, from, the, worst, cnnisfakene...
7    [germaphobia, is, a, mental, illness, we, urge...
8    [rting, this, because, i, feel, like, black, p...
9    [i, deserved, every, l, i, ever, took, but, i,...
Name: text, dtype: object

In [38]:
# English stopwords from NLTK, held in a set for fast membership tests.
stopword_set = set(stopwords.words("english"))


def _without_stopwords(tokens):
    """Return the tokens of one tweet that are not in `stopword_set`, in order."""
    return [token for token in tokens if token not in stopword_set]


# Filter every token list; tweets made up only of stopwords become empty lists.
tweets_text = tweets_text.apply(_without_stopwords)
tweets_text.head(10)

0                            [life, indiachinafaceoff]
1    [govt, partly, defended, schools, policy, sayi...
2    [sister, today, yep, twins, identical, look, b...
3                               [video, enrages, hate]
4         [couple, cool, wedding, dress, doodles, uuu]
5    [jesus, says, aware, free, school, meals, camp...
6    [misinformation, worst, cnnisfakenews, protest...
7    [germaphobia, mental, illness, urgently, need,...
8    [rting, feel, like, black, people, die, police...
9    [deserved, every, l, ever, took, learned, mist...
Name: text, dtype: object

Now your dataset is ready to be used in an NLP model. The tweets have only been split on whitespace, so depending on the final application you might still need a proper tokenizer or other NLP preprocessing steps (for example stemming or lemmatization).