<a href="https://colab.research.google.com/github/gupta24789/sentiment-analysis/blob/main/01_generate_train_val.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import re
import os
import string
import pandas as pd
import numpy as np
import random
import nltk
from nltk.corpus import stopwords, twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

# Fetch the NLTK resources this notebook reads: the stopword lists (used when
# cleaning tweets) and the twitter_samples corpus (the source of the tweets).
nltk.download('stopwords')
nltk.download('twitter_samples')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

## Load Data

In [9]:
# Read the raw tweet strings for each sentiment class from the NLTK corpus.
positive_tweets, negative_tweets = (
    twitter_samples.strings(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)

In [10]:
# Peek at the first three positive tweets.
positive_tweets[:3]

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!']

In [11]:
# Peek at the first three negative tweets.
negative_tweets[:3]

['hopeless for tmr :(',
 "Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(",
 '@Hegelbon That heart sliding into the waste basket. :(']

## Process the Data

In [12]:
def process_tweet(tweet):
    """Clean, tokenize, and stem a single tweet.

    Steps, in order: strip stock tickers such as $GE, a leading old-style
    "RT" marker, hyperlinks, and '#' signs; tokenize with TweetTokenizer
    (preserve_case=False, strip_handles=True, reduce_len=True); drop English
    stopwords and punctuation tokens; Porter-stem the remaining tokens.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests in the filter below.
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks: match only the URL token itself (up to the next
    # whitespace) so that words following a link on the same line are kept
    tweet = re.sub(r'https?://\S*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # Keep tokens that are neither stopwords nor punctuation, then stem them.
    # Note: `in string.punctuation` is a substring test against the
    # punctuation string, so a multi-character token is dropped only when it
    # appears there as a contiguous run.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean

In [13]:
# Pair every positive tweet with its cleaned tokens and the positive label (1).
positive_tweets_list = [
    {"raw_tweet": raw_text, "processed_tweet": process_tweet(raw_text), "label": 1}
    for raw_text in positive_tweets
]

In [14]:
# Pair every negative tweet with its cleaned tokens and the negative label (0).
negative_tweets_list = [
    {"raw_tweet": raw_text, "processed_tweet": process_tweet(raw_text), "label": 0}
    for raw_text in negative_tweets
]

In [15]:
# Sanity check: report how many records were built for each class.
print("Positive : {}\nNegative : {}".format(len(positive_tweets_list), len(negative_tweets_list)))

Positive : 5000
Negative : 5000


In [16]:
# Fraction of each class assigned to the training split; the rest is validation.
TRAIN_FRACTION = 0.8
# Fixed seed so the split is identical on every Restart & Run All.
SPLIT_SEED = 42

# The same shuffled positions are used to index both class lists when the
# splits are assembled, so the two lists must be the same length.
assert len(positive_tweets_list) == len(negative_tweets_list), \
    "positive and negative tweet lists must have the same length"

index_list = list(range(len(positive_tweets_list)))
# A dedicated Random instance keeps the shuffle reproducible without
# touching the global `random` state.
random.Random(SPLIT_SEED).shuffle(index_list)

# Derive the split point from the actual data size rather than a literal.
train_size = int(TRAIN_FRACTION * len(index_list))
train_index = index_list[:train_size]
val_index = index_list[train_size:]

In [17]:
# Build each split from the same index positions of both classes:
# all selected positive records first, followed by the negative ones.
train = [
    class_records[i]
    for class_records in (positive_tweets_list, negative_tweets_list)
    for i in train_index
]
val = [
    class_records[i]
    for class_records in (positive_tweets_list, negative_tweets_list)
    for i in val_index
]

In [18]:
# Show the number of records in the training and validation splits.
len(train), len(val)

(8000, 2000)

In [19]:
# Turn the record lists into DataFrames (one column per dict key).
train_df, val_df = pd.DataFrame(train), pd.DataFrame(val)

In [20]:
# Class balance of the training split.
train_df["label"].value_counts()

1    4000
0    4000
Name: label, dtype: int64

In [21]:
# Class balance of the validation split.
val_df["label"].value_counts()

1    1000
0    1000
Name: label, dtype: int64

In [23]:
# Directory (relative to the working directory) that receives the CSV splits.
OUTPUT_DIR = "data"
# exist_ok=True makes this cell safe to re-run when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [24]:
# Persist both splits without the DataFrame index column.
# NOTE(review): the processed_tweet column holds Python lists (the return value
# of process_tweet); CSV presumably stores them in their string form, so
# consumers would need to parse that column back into a list — confirm there.
train_df.to_csv(f"{OUTPUT_DIR}/train.csv", index = False)
val_df.to_csv(f"{OUTPUT_DIR}/val.csv", index = False)