**Imports**

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

**Text Cleaning Util**

In [None]:
# Shared resources used by clean_text. They are created once at cell level so
# the function can reuse them on every call.
stemmer = PorterStemmer()
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)  # set => constant-time membership tests

def clean_text(text):
    """Normalize one raw text value into a string of stemmed tokens.

    Steps, in order: lower-case, remove digit runs, remove URL-like tokens
    (anything starting with ``http`` or ``www``), remove every character that
    is not an ASCII letter or whitespace, drop English stop words, apply the
    Porter stemmer to the remaining tokens, and collapse whitespace.

    Parameters
    ----------
    text : str
        Raw input. Any non-string value (for example ``None`` or the float
        ``NaN`` that a missing cell becomes) is treated as empty text.

    Returns
    -------
    str
        Space-separated stemmed tokens, or ``''`` when nothing survives.
    """
    # Series.apply passes missing cells through as float NaN, which has no
    # .lower(); return empty text instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # This also strips apostrophes, so contractions reach the stop-word
    # filter without them (e.g. "don't" becomes "dont").
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Filter stop words and stem in a single pass over the tokens.
    text = ' '.join(
        stemmer.stem(word) for word in text.split() if word not in stop_words
    )
    return re.sub(r'\s+', ' ', text).strip()


## SPAM

In [113]:
# Forward slashes work on every OS; backslashes inside a normal string literal
# are parsed as escape sequences and are only valid path separators on Windows.
df = pd.read_csv("../data/orig/main/spam/data.csv", encoding='ISO-8859-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [114]:
# Keep only the label and message columns, rename them to y/text, encode the
# label as 1 for "spam" and 0 for anything else, and clean the message text.
# Building a new frame with rename/assign avoids writing into a column slice
# of the raw frame (SettingWithCopyWarning) and avoids in-place mutation.
df = (
    df[["v1", "v2"]]
    .rename(columns={"v1": "y", "v2": "text"})
    .assign(
        y=lambda frame: (frame["y"] == "spam").astype(int),
        text=lambda frame: frame["text"].apply(clean_text),
    )
)

In [115]:
# Seed the shuffle as well as the split: with an unseeded shuffle the split
# receives a different row order on every run, so the written files (and the
# class counts reported from them) would not be reproducible.
df = df.sample(frac=1, random_state=65).reset_index(drop=True)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=65)

# index=False: persist only the y/text columns.
df_train.to_csv("../data/orig/processed/train/spam-data.csv", index=False)
df_test.to_csv("../data/orig/processed/test/spam-data.csv", index=False)

In [2]:
# Sanity check: reload the training split that was just written to disk and
# display how many rows carry each label (1 = spam, 0 = not spam).
df_train = pd.read_csv("../data/orig/processed/train/spam-data.csv")
df_train['y'].value_counts()

y
0    3855
1     602
Name: count, dtype: int64

## SENTIMENT

In [78]:
# Forward slashes work on every OS; backslashes inside a normal string literal
# are parsed as escape sequences and are only valid path separators on Windows.
# header=None: the file has no header row, so columns are numbered 0..5.
df = pd.read_csv("../data/orig/main/sentiment/data.csv", encoding='latin-1', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [83]:
# Keep column 0 (label) and column 5 (tweet text), rename them to y/text,
# map label 4 to 1 and every other label to 0, clean the text, then shuffle.
# A rename/assign chain builds a fresh frame, which avoids assigning into a
# column slice (SettingWithCopyWarning) and avoids in-place mutation; the
# shuffle is seeded so re-running the notebook yields the same row order.
df = (
    df[[0, 5]]
    .rename(columns={0: 'y', 5: 'text'})
    .assign(
        y=lambda frame: (frame['y'] == 4).astype(int),
        text=lambda frame: frame['text'].apply(clean_text),  # this takes 5mins+
    )
    .sample(frac=1, random_state=46)
    .reset_index(drop=True)
)

In [94]:
# Hold out 30% of the cleaned sentiment rows for testing; random_state fixes
# which rows are selected for a given input row order.
df_train, df_test = train_test_split(df, random_state=46, test_size=0.3)

In [96]:
# Persist both sentiment splits; index=False writes only the y/text columns.
df_train.to_csv("../data/orig/processed/train/sentiment-data.csv", index=False)
df_test.to_csv("../data/orig/processed/test/sentiment-data.csv", index=False)

## NEWS

In [74]:
# Forward slashes keep these paths portable and consistent with the
# "../data/..." paths used when the processed files are written.
df_1 = pd.read_csv("../data/orig/main/news/BuzzFeed_fake_news_content.csv")
df_2 = pd.read_csv("../data/orig/main/news/BuzzFeed_real_news_content.csv")

print("No. of Rows (Fake): ", df_1.shape[0])
print("No. of Rows (Real): ", df_2.shape[0])

No. of Rows (Fake):  91
No. of Rows (Real):  91


In [75]:
def prepare_news(df_raw, label):
    """Build the two-column (text, y) frame for one news source.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Raw articles; must contain 'title' and 'text' columns.
    label : int
        Class label assigned to every row of this source.

    Returns
    -------
    pd.DataFrame
        New frame with the cleaned "title + text" string in 'text' and the
        constant label in 'y', indexed like ``df_raw``.
    """
    # A missing title or body would make the concatenation NaN for that row;
    # fill with '' so the other field is kept and clean_text receives a str.
    combined = df_raw['title'].fillna('') + ' ' + df_raw['text'].fillna('')
    # Constructing a new frame (rather than assigning into a column slice)
    # avoids SettingWithCopyWarning.
    return pd.DataFrame({'text': combined.apply(clean_text), 'y': label})


df_1 = prepare_news(df_1, 1)  # fake news -> 1
df_2 = prepare_news(df_2, 0)  # real news -> 0

# Split each class separately so both classes appear in train and test.
df1_train, df1_test = train_test_split(df_1, random_state=42)
df2_train, df2_test = train_test_split(df_2, random_state=36)

In [76]:
# Merge the per-class splits and shuffle so the classes are interleaved.
# The shuffles are seeded so the written files are identical across runs.
df_train = pd.concat([df1_train, df2_train], ignore_index=True)
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
df_train.to_csv("../data/orig/processed/train/news-data.csv", index=False)

df_test = pd.concat([df1_test, df2_test], ignore_index=True)
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
df_test.to_csv("../data/orig/processed/test/news-data.csv", index=False)

**Minis** — small, class-balanced subsamples of the processed training sets


In [4]:
# Same file the SPAM section writes; forward slashes keep the path portable
# (a backslash-separated path only resolves on Windows).
df_spam = pd.read_csv("../data/orig/processed/train/spam-data.csv")
df_spam['y'].value_counts()

y
0    3855
1     602
Name: count, dtype: int64

In [6]:
# Balance the classes by down-sampling label 0 to the number of label-1 rows.
# The count is taken from the data instead of being hard-coded, so the cell
# stays correct if the training split changes; the sample is seeded so the
# same rows are picked on every run.
spam_rows = df_spam[df_spam['y'] == 1]
ham_rows = df_spam[df_spam['y'] == 0].sample(n=len(spam_rows), random_state=65)
df_spam = pd.concat([spam_rows, ham_rows], ignore_index=True)

In [8]:
# index=False matches the other processed CSVs written by this notebook, which
# contain only the y/text columns; without it an extra unnamed index column is
# written. Forward slashes keep the path portable.
df_spam.to_csv("../data/orig/processed/train/spam-data-mini.csv", index=False)
df_spam.shape

(1204, 2)

In [3]:
# Same file the SENTIMENT section writes; forward slashes keep the path
# portable (a backslash-separated path only resolves on Windows).
df_sentiment = pd.read_csv("../data/orig/processed/train/sentiment-data.csv")
df_sentiment.shape

(1200000, 2)

In [4]:
# Show how many training rows carry each label before subsampling.
df_sentiment['y'].value_counts()

y
1    600016
0    599984
Name: count, dtype: int64

In [5]:
# Rows to keep per class in the mini training set.
n = 2500
# Draw n rows from each class (label 1 first, then label 0). The draws are
# seeded so the mini set is the same on every run.
df_sentiment = pd.concat(
    [
        df_sentiment[df_sentiment['y'] == label].sample(n=n, random_state=46)
        for label in (1, 0)
    ],
    ignore_index=True,
)

In [7]:
# index=False matches the other processed CSVs written by this notebook, which
# contain only the y/text columns; without it an extra unnamed index column is
# written. Forward slashes keep the path portable.
df_sentiment.to_csv("../data/orig/processed/train/sentiment-data-mini.csv", index=False)
df_sentiment.shape

(5000, 2)