In [4]:
import pandas as pd
from pathlib import Path
import numpy as np

input_folder_path = Path("data/raw")
train_path = "training.1600000.processed.noemoticon.csv"

    # Reading the dataset with no columns titles and with latin encoding 
df = pd.read_csv(train_path, sep = ",", encoding='latin-1', header=None, error_bad_lines=False)

# As the data has no column titles, we will add our own
df.columns = ["label", "time", "date", "query", "username", "text"]

In [15]:
def process_tweet(tweet):
    '''
    Input: 
        tweet: a string containing a tweet
    Output:
        clean_tweet: a list of words containing the processed tweet
    
    '''

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    
    clean_tweet = []
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
            word not in string.punctuation): # remove punctuation
            #clean_tweet.append(word)
            stem_word = stemmer.stem(word) # stemming word
            clean_tweet.append(stem_word)
    
    return clean_tweet

In [16]:
df_pos = df[df['label'] == 4]
df_neg = df[df['label'] == 0]
    
    # Only retaining 1/4th of our data from each output group
    # Feel free to alter the dividing factor depending on your workspace
    # 1/64 is a good place to start if you're unsure about your machine's power
df_pos = df_pos.iloc[:int(len(df_pos)/4)]
df_neg = df_neg.iloc[:int(len(df_neg)/4)]
print(len(df_pos), len(df_neg))

all_positive_tweets = df_pos.text.to_list()
all_negative_tweets = df_neg.text.to_list()

val_pos   = all_positive_tweets[40000:] # generating validation set for positive tweets
train_pos  = all_positive_tweets[:40000]# generating training set for positive tweets

200000 200000


In [17]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [14]:
import re
import nltk
import string
nltk.download('twitter_samples')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples 
import pandas as pd
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Ricard\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ricard\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
stopwords_english = stopwords.words('english')
val_pos   = all_positive_tweets[40000:80000] # generating validation set for positive tweets
train_pos  = all_positive_tweets[:40000]# generating training set for positive tweets

# Split negative set into validation and training
val_neg   = all_negative_tweets[40000:80000] # generating validation set for negative tweets
train_neg  = all_negative_tweets[:40000] # generating training set for nagative tweets

# Delete all_positive_tweets and all_negative_tweets from memory
del all_positive_tweets
del all_negative_tweets

# Combine training data into one set
train_x = train_pos + train_neg 

# Combine validation data into one set
val_x  = val_pos + val_neg

# Set the labels for the training set (1 for positive, 0 for negative)
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))

# Set the labels for the validation set (1 for positive, 0 for negative)
val_y  = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))


# Build the vocabulary

# Include special tokens 
# started with pad, end of line and unk tokens
Vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2} 

# Note that we build vocab using training data
for tweet in train_x: 
    processed_tweet = process_tweet(tweet)
    for word in processed_tweet:
        if word not in Vocab: 
            Vocab[word] = len(Vocab)

In [None]:
train_pos, train_neg, val_pos, val_neg, train_x, val_x, train_y, val_y, Vocab

In [27]:
Vocab

{'__PAD__': 0,
 '__</e>__': 1,
 '__UNK__': 2,
 'love': 3,
 'u': 4,
 'guy': 5,
 'r': 6,
 'best': 7,
 'im': 8,
 'meet': 9,
 'one': 10,
 'besti': 11,
 'tonight': 12,
 'cant': 13,
 'wait': 14,
 'girl': 15,
 'talk': 16,
 'thank': 17,
 'twitter': 18,
 'add': 19,
 'sunisa': 20,
 'got': 21,
 'hin': 22,
 'show': 23,
 'dc': 24,
 'area': 25,
 'sweetheart': 26,
 'sick': 27,
 'realli': 28,
 'cheap': 29,
 'hurt': 30,
 'much': 31,
 'eat': 32,
 'real': 33,
 'food': 34,
 'plu': 35,
 'friend': 36,
 'make': 37,
 'soup': 38,
 'effect': 39,
 'everyon': 40,
 'tell': 41,
 'burst': 42,
 'laugh': 43,
 'loud': 44,
 'come': 45,
 'sulk': 46,
 'than': 47,
 'respons': 48,
 'ihad': 49,
 'alreadi': 50,
 'find': 51,
 'answer': 52,
 'jealou': 53,
 'hope': 54,
 'great': 55,
 'time': 56,
 'vega': 57,
 'like': 58,
 "acm'": 59,
 'ah': 60,
 'congrat': 61,
 'mr': 62,
 'fletcher': 63,
 'final': 64,
 'join': 65,
 'respond': 66,
 'stupid': 67,
 'cat': 68,
 'help': 69,
 'type': 70,
 'forgiv': 71,
 'error': 72,
 'crazi': 73,
 'da

In [29]:
import pandas as pd
from pathlib import Path
import numpy as np

input_folder_path = Path("data/raw")
train_path = "training.1600000.processed.noemoticon.csv"

    # Reading the dataset with no columns titles and with latin encoding 
df = pd.read_csv(train_path, sep = ",", encoding='latin-1', header=None, error_bad_lines=False)

# As the data has no column titles, we will add our own
df.columns = ["label", "time", "date", "query", "username", "text"]

# Separating positive and negative rows
df_pos = df[df['label'] == 4]
df_neg = df[df['label'] == 0]
    
    # Only retaining 1/4th of our data from each output group
    # Feel free to alter the dividing factor depending on your workspace
    # 1/64 is a good place to start if you're unsure about your machine's power
df_pos = df_pos.iloc[:int(len(df_pos)/4)]
df_neg = df_neg.iloc[:int(len(df_neg)/4)]
print(len(df_pos), len(df_neg))

all_positive_tweets = df_pos.text.to_list()
all_negative_tweets = df_neg.text.to_list()

val_pos   = all_positive_tweets[40000:80000] # generating validation set for positive tweets
train_pos  = all_positive_tweets[:40000]# generating training set for positive tweets

# Split negative set into validation and training
val_neg   = all_negative_tweets[40000:80000] # generating validation set for negative tweets
train_neg  = all_negative_tweets[:40000] # generating training set for nagative tweets

# Delete all_positive_tweets and all_negative_tweets from memory
del all_positive_tweets
del all_negative_tweets

# Combine training data into one set
train_x = train_pos + train_neg 

# Combine validation data into one set
val_x  = val_pos + val_neg

# Set the labels for the training set (1 for positive, 0 for negative)
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))

# Set the labels for the validation set (1 for positive, 0 for negative)
val_y  = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))


# Build the vocabulary

# Include special tokens 
# started with pad, end of line and unk tokens
Vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2} 

# Note that we build vocab using training data
for tweet in train_x: 
    processed_tweet = process_tweet(tweet)
    for word in processed_tweet:
        if word not in Vocab: 
            Vocab[word] = len(Vocab)

# Path of the output data folder
Path("data/processed").mkdir(exist_ok=True)
prepared_folder_path = Path("data/processed")

X_train_path = prepared_folder_path / "X_train.txt"
y_train_path = prepared_folder_path / "y_train.txt"
X_valid_path = prepared_folder_path / "X_valid.txt"
y_valid_path = prepared_folder_path / "y_valid.txt"
train_pos_path = prepared_folder_path / "train_pos.txt"
train_neg_path = prepared_folder_path / "train_neg.txt"
vocab_path = prepared_folder_path / "vocab.json"

with open(X_train_path, 'w') as temp_file:
    for item in train_x:
        temp_file.write("%s\n" % item)

with open(y_train_path , 'w') as temp_file:
    for item in train_y:
        temp_file.write("%s\n" % item)

with open(X_valid_path, 'w') as temp_file:
    for item in val_x:
        temp_file.write("%s\n" % item)

with open(y_valid_path, 'w') as temp_file:
    for item in val_y:
        temp_file.write("%s\n" % item)
        
with open(train_pos_path, 'w') as temp_file:
    for item in train_pos:
        temp_file.write("%s\n" % item)

with open(train_neg_path, 'w') as temp_file:
    for item in train_neg:
        temp_file.write("%s\n" % item)

with open(vocab_path, 'wb') as fp:
    pickle.dump(Vocab, fp)        

200000 200000


UnicodeEncodeError: 'charmap' codec can't encode character '\x9a' in position 1: character maps to <undefined>

In [31]:
with open(X_train_path, 'w', encoding = 'utf-8') as temp_file:
    for item in train_x:
        temp_file.write("%s\n" % item)

with open(y_train_path , 'w', encoding = 'utf-8') as temp_file:
    for item in train_y:
        temp_file.write("%s\n" % item)

with open(X_valid_path, 'w', encoding = 'utf-8') as temp_file:
    for item in val_x:
        temp_file.write("%s\n" % item)

with open(y_valid_path, 'w', encoding = 'utf-8') as temp_file:
    for item in val_y:
        temp_file.write("%s\n" % item)
        
with open(train_pos_path, 'w', encoding = 'utf-8') as temp_file:
    for item in train_pos:
        temp_file.write("%s\n" % item)

with open(train_neg_path, 'w', encoding = 'utf-8') as temp_file:
    for item in train_neg:
        temp_file.write("%s\n" % item)
        
import json
with open(vocab_path, 'w', encoding = 'utf-8') as fp:
    json.dump(Vocab, fp)  
      

ValueError: binary mode doesn't take an encoding argument

In [35]:
import json
with open(vocab_path, 'w', encoding = 'utf-8') as fp:
    json.dump(Vocab, fp)  

In [34]:
input_folder_path = Path("data/raw")


{'__PAD__': 0,
 '__</e>__': 1,
 '__UNK__': 2,
 'love': 3,
 'u': 4,
 'guy': 5,
 'r': 6,
 'best': 7,
 'im': 8,
 'meet': 9,
 'one': 10,
 'besti': 11,
 'tonight': 12,
 'cant': 13,
 'wait': 14,
 'girl': 15,
 'talk': 16,
 'thank': 17,
 'twitter': 18,
 'add': 19,
 'sunisa': 20,
 'got': 21,
 'hin': 22,
 'show': 23,
 'dc': 24,
 'area': 25,
 'sweetheart': 26,
 'sick': 27,
 'realli': 28,
 'cheap': 29,
 'hurt': 30,
 'much': 31,
 'eat': 32,
 'real': 33,
 'food': 34,
 'plu': 35,
 'friend': 36,
 'make': 37,
 'soup': 38,
 'effect': 39,
 'everyon': 40,
 'tell': 41,
 'burst': 42,
 'laugh': 43,
 'loud': 44,
 'come': 45,
 'sulk': 46,
 'than': 47,
 'respons': 48,
 'ihad': 49,
 'alreadi': 50,
 'find': 51,
 'answer': 52,
 'jealou': 53,
 'hope': 54,
 'great': 55,
 'time': 56,
 'vega': 57,
 'like': 58,
 "acm'": 59,
 'ah': 60,
 'congrat': 61,
 'mr': 62,
 'fletcher': 63,
 'final': 64,
 'join': 65,
 'respond': 66,
 'stupid': 67,
 'cat': 68,
 'help': 69,
 'type': 70,
 'forgiv': 71,
 'error': 72,
 'crazi': 73,
 'da

In [82]:
List = open(val_pos_path, encoding = 'utf-8').readlines()
for i in range(len(List)):
    List[i] = List[i].replace('\n', '')

In [67]:
List = open(y_valid_path, encoding = 'utf-8').readlines()
for i in range(len(List)):
    List[i] = float(List[i])
List = np.array(List)

In [83]:
List == val_pos

True

In [75]:
json_file = open(vocab_path, 'r', encoding = 'utf-8')

jsondata = json.load(json_file)

In [76]:
jsondata == Vocab

True

In [79]:
val_pos_path = prepared_folder_path / "val_pos.txt"
val_neg_path = prepared_folder_path / "val_neg.txt"

with open(val_pos_path, 'w', encoding = 'utf-8') as temp_file:
    for item in val_pos:
        temp_file.write("%s\n" % item)
        
with open(val_neg_path, 'w', encoding = 'utf-8') as temp_file:
    for item in val_neg:
        temp_file.write("%s\n" % item)