# Sentiment Analysis

In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import re # regex


## Data Preprocessing

In [5]:
df = pd.read_csv('datasets/twitter-2013dev-A.txt', delimiter='\t', names=['id', 'sentiment', 'tweet'])
df

Unnamed: 0,id,sentiment,tweet
0,260097528899452929,neutral,Won the match #getin . Plus\u002c tomorrow is ...
1,263791921753882624,neutral,Some areas of New England could see the first ...
2,264194578381410304,negative,@francesco_con40 2nd worst QB. DEFINITELY Tony...
3,264041328420204544,neutral,#Thailand Washington - US President Barack Oba...
4,263816256640126976,neutral,Did y\u2019all hear what Tony Romo dressed up ...
...,...,...,...
1649,264241571908681728,neutral,#WEB YouTube improves upload process with opti...
1650,264228980444495875,positive,Gonna change my Tumblr theme. I hope I can fin...
1651,264210367192915968,neutral,I\u2019m so jealous of everyone at the Justin ...
1652,263737249240342528,neutral,Jim Harbaugh\u002c Alex Smith Drive Giants Wor...


Text will be preprocessed:
1. Unicode escape sequence `\u` will be decoded.
2. Punctuations will be removed.
3. `#` and `@` will also be removed from the texts.

In [6]:
df['tweet'] = df['tweet'].apply(lambda x: x.encode('utf-8').decode('unicode_escape'))
df

  df['tweet'] = df['tweet'].apply(lambda x: x.encode('utf-8').decode('unicode_escape'))
  df['tweet'] = df['tweet'].apply(lambda x: x.encode('utf-8').decode('unicode_escape'))


Unnamed: 0,id,sentiment,tweet
0,260097528899452929,neutral,"Won the match #getin . Plus, tomorrow is a ver..."
1,263791921753882624,neutral,Some areas of New England could see the first ...
2,264194578381410304,negative,@francesco_con40 2nd worst QB. DEFINITELY Tony...
3,264041328420204544,neutral,#Thailand Washington - US President Barack Oba...
4,263816256640126976,neutral,Did y’all hear what Tony Romo dressed up as fo...
...,...,...,...
1649,264241571908681728,neutral,#WEB YouTube improves upload process with opti...
1650,264228980444495875,positive,Gonna change my Tumblr theme. I hope I can fin...
1651,264210367192915968,neutral,I’m so jealous of everyone at the Justin Biebe...
1652,263737249240342528,neutral,"Jim Harbaugh, Alex Smith Drive Giants World Se..."


Questions:
1. Should we remove the words following @ and #?
2. Should we remove numbers? like 2nd?

In [9]:
def clean_text(text):
    """
    Remove @, #, and punctuation marks from the text.

    Parameters:
        text (str): The tweet text.
    Returns:
        str: The cleaned tweet text.
    """
    # Replace @, #, and non-word characters with a space
    # cleaned_text = re.sub(r'[@#]+|\W', ' ', text)


    # Replace @ and the following text, # and the following text, and non-word characters with a space
    cleaned_text = re.sub(r'@[\w]+|#[\w]+|\W', ' ', text)

    # Remove extra spaces
    cleaned_text = ' '.join(cleaned_text.split())

    return cleaned_text

df['cleaned_text'] = df['tweet'].apply(clean_text)
df

Unnamed: 0,id,sentiment,tweet,cleaned_text
0,260097528899452929,neutral,"Won the match #getin . Plus, tomorrow is a ver...",Won the match getin Plus tomorrow is a very bu...
1,263791921753882624,neutral,Some areas of New England could see the first ...,Some areas of New England could see the first ...
2,264194578381410304,negative,@francesco_con40 2nd worst QB. DEFINITELY Tony...,francesco_con40 2nd worst QB DEFINITELY Tony R...
3,264041328420204544,neutral,#Thailand Washington - US President Barack Oba...,Thailand Washington US President Barack Obama ...
4,263816256640126976,neutral,Did y’all hear what Tony Romo dressed up as fo...,Did y all hear what Tony Romo dressed up as fo...
...,...,...,...,...
1649,264241571908681728,neutral,#WEB YouTube improves upload process with opti...,WEB YouTube improves upload process with optio...
1650,264228980444495875,positive,Gonna change my Tumblr theme. I hope I can fin...,Gonna change my Tumblr theme I hope I can fini...
1651,264210367192915968,neutral,I’m so jealous of everyone at the Justin Biebe...,I m so jealous of everyone at the Justin Biebe...
1652,263737249240342528,neutral,"Jim Harbaugh, Alex Smith Drive Giants World Se...",Jim Harbaugh Alex Smith Drive Giants World Ser...


In [7]:
from nltk.tokenize import word_tokenize
df['tokenized_tweet'] = df['tweet'].apply(word_tokenize)
df

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\willi/nltk_data'
    - 'c:\\Users\\willi\\Desktop\\_77\\_Activities\\venvs\\py-acts\\nltk_data'
    - 'c:\\Users\\willi\\Desktop\\_77\\_Activities\\venvs\\py-acts\\share\\nltk_data'
    - 'c:\\Users\\willi\\Desktop\\_77\\_Activities\\venvs\\py-acts\\lib\\nltk_data'
    - 'C:\\Users\\willi\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


## Model Selection and Training