<a href="https://colab.research.google.com/github/gnanadarshinim/code_demo/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
from textblob import TextBlob
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split

In [None]:
# Load the airline tweets CSV from the Colab sample_data directory.
TWEETS_CSV = "/content/sample_data/Tweets.csv"
df = pd.read_csv(TWEETS_CSV)

In [None]:
# Basic data exploration: preview the first rows of the raw frame
# (head() with no argument shows five rows).
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,5.70306e+17,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,5.70301e+17,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,5.70301e+17,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,5.70301e+17,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,5.70301e+17,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [None]:
# List every column name so we can decide which ones the preprocessing needs.
df.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [None]:

# Keep only the sentiment label and the tweet text -- the two columns the
# preprocessing cells that follow operate on.
#
# The columns to keep are selected explicitly instead of dropping the rest:
# a drop of already-removed columns raises KeyError, so a drop-based cell
# fails when it is executed a second time, whereas this selection can be
# re-run safely. .copy() gives an independent frame so that later column
# assignments do not trigger SettingWithCopyWarning.
KEEP_COLUMNS = ['airline_sentiment', 'text']
df = df[KEEP_COLUMNS].copy()

In [None]:
# Confirm that only the sentiment label and the tweet text remain.
df.head(3)

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...


In [None]:
# Summarise the frame: column dtypes, non-null counts and memory usage.
# info is a method and must be called -- a bare attribute reference only
# displays the bound-method repr instead of the summary.
df.info()

<bound method DataFrame.info of       airline_sentiment                                               text
0               neutral                @VirginAmerica What @dhepburn said.
1              positive  @VirginAmerica plus you've added commercials t...
2               neutral  @VirginAmerica I didn't today... Must mean I n...
3              negative  @VirginAmerica it's really aggressive to blast...
4              negative  @VirginAmerica and it's a really big bad thing...
...                 ...                                                ...
14635          positive  @AmericanAir thank you we got on a different f...
14636          negative  @AmericanAir leaving over 20 minutes Late Flig...
14637           neutral  @AmericanAir Please bring American Airlines to...
14638          negative  @AmericanAir you have my money, you change my ...
14639           neutral  @AmericanAir we have 8 ppl so we need 2 know h...

[14640 rows x 2 columns]>

In [None]:
# (number of rows, number of columns) of the trimmed frame.
df.shape

(14640, 2)

In [None]:
# Count the missing values in each column.
df.isna().sum()

airline_sentiment    0
text                 0
dtype: int64

In [None]:
# Look at the raw tweet text once more before it is cleaned.
df.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [None]:
# Cleaning unnecessary tags and links from the tweets
def cleantxt(text):
    """Remove mentions, hashtags, retweet markers, emoji and links from a tweet.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with the noise tokens removed. Whitespace that surrounded a
        removed token is left untouched, so the result may contain leading,
        trailing or doubled spaces.
    """
    # @mentions and #hashtags (letters, digits and underscores after the sigil).
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)

    # Retweet marker. The leading word boundary stops words that merely end
    # in "RT" (e.g. "START now") from losing their tail.
    text = re.sub(r'\bRT\s+', '', text)

    # Emoji, pictographs, transport symbols, misc symbols and dingbats.
    text = re.sub(
        '['
        '\U0001F300-\U0001F64F'
        '\U0001F680-\U0001F6FF'
        '\u2600-\u26FF\u2700-\u27BF]+',
        '',
        text,
    )

    # Links: everything from "http" up to the next whitespace. This single
    # raw-string pattern covers both http:// and https:// URLs.
    text = re.sub(r'http\S+', '', text)

    return text

In [None]:
# Cache for the NLTK vocabulary; filled on first use by _english_vocab().
_ENGLISH_VOCAB = None


def _english_vocab():
    """Return the NLTK word list as a set, building it only once."""
    global _ENGLISH_VOCAB
    if _ENGLISH_VOCAB is None:
        # Building this set is expensive, so do it once rather than on every
        # call of remove_noneng (which is meant to be applied row by row).
        _ENGLISH_VOCAB = frozenset(nltk.corpus.words.words())
    return _ENGLISH_VOCAB


def remove_noneng(x):
    """Drop alphabetic tokens that are not in the NLTK word list.

    Parameters
    ----------
    x : str
        Text to filter.

    Returns
    -------
    str
        The tokens produced by ``nltk.wordpunct_tokenize`` that are kept,
        joined with single spaces. A token is kept when its lower-cased form
        is in the word list, or when it is not purely alphabetic (numbers,
        punctuation, mixed tokens).

    Notes
    -----
    Needs the NLTK ``words`` corpus to be available locally
    (presumably via ``nltk.download('words')`` -- TODO confirm it is fetched
    somewhere in this notebook).
    """
    vocab = _english_vocab()
    kept = (
        token
        for token in nltk.wordpunct_tokenize(x)
        if token.lower() in vocab or not token.isalpha()
    )
    return " ".join(kept)


In [None]:
# Run every tweet through cleantxt and store the result back in the text column.
df["text"] = df["text"].map(cleantxt)


In [None]:
# Inspect the cleaned tweets (mentions, hashtags and links should be gone).
df["text"]

0                                              What  said.
1         plus you've added commercials to the experien...
2         I didn't today... Must mean I need to take an...
3         it's really aggressive to blast obnoxious "en...
4                 and it's a really big bad thing about it
                               ...                        
14635     thank you we got on a different flight to Chi...
14636     leaving over 20 minutes Late Flight. No warni...
14637                   Please bring American Airlines to 
14638     you have my money, you change my flight, and ...
14639     we have 8 ppl so we need 2 know how many seat...
Name: text, Length: 14640, dtype: object

In [None]:
# Now that the tweets are cleaned, define helpers that score each tweet's subjectivity and polarity with TextBlob.
def getsub(text):
    """Return the subjectivity score that TextBlob assigns to ``text``."""
    blob_sentiment = TextBlob(text).sentiment
    return blob_sentiment.subjectivity
def getpola(text):
    """Return the polarity score that TextBlob assigns to ``text``."""
    blob_sentiment = TextBlob(text).sentiment
    return blob_sentiment.polarity

In [None]:
# Score every cleaned tweet and attach the two TextBlob scores as new columns.
df = df.assign(
    subjectivity=df['text'].apply(getsub),
    polarity=df['text'].apply(getpola),
)

In [None]:
# Check the new subjectivity and polarity columns on a few rows.
df.head(3)

Unnamed: 0,airline_sentiment,text,subjectivity,polarity
0,neutral,What said.,0.0,0.0
1,positive,plus you've added commercials to the experien...,0.0,0.0
2,neutral,I didn't today... Must mean I need to take an...,0.6875,-0.390625


In [None]:
def analysis(score):
    """Turn a polarity score into a binary sentiment name.

    Scores below zero give 'negative'; zero and above give 'positive'.
    A score that compares false both ways (NaN) yields None.
    """
    if score >= 0:
        return 'positive'
    return 'negative' if score < 0 else None
# Label each tweet from the sign of its TextBlob polarity.
df['analysis'] = df['polarity'].map(analysis)

In [None]:
# Binary target: negative tweets become 0; positive and neutral tweets become 1.
sentiment_mapping = dict(positive=1, negative=0, neutral=1)

# Translate each airline_sentiment value through the mapping into a new 'label' column.
df = df.assign(label=df['airline_sentiment'].map(sentiment_mapping))

In [None]:
# The final dataset after preprocessing: the original sentiment, the cleaned
# text, the TextBlob subjectivity/polarity scores, the polarity-based
# 'analysis' label and the binary 'label' target.
df

Unnamed: 0,airline_sentiment,text,subjectivity,polarity,analysis,label
0,neutral,What said.,0.000000,0.000000,positive,1
1,positive,plus you've added commercials to the experien...,0.000000,0.000000,positive,1
2,neutral,I didn't today... Must mean I need to take an...,0.687500,-0.390625,negative,1
3,negative,"it's really aggressive to blast obnoxious ""en...",0.350000,0.006250,positive,0
4,negative,and it's a really big bad thing about it,0.383333,-0.350000,negative,0
...,...,...,...,...,...,...
14635,positive,thank you we got on a different flight to Chi...,0.600000,0.000000,positive,1
14636,negative,leaving over 20 minutes Late Flight. No warni...,0.600000,-0.300000,negative,0
14637,neutral,Please bring American Airlines to,0.000000,0.000000,positive,1
14638,negative,"you have my money, you change my flight, and ...",0.375000,-0.125000,negative,0
