In [49]:
import pathlib

BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR / "datasets"
EXPORT_DIR = DATASET_DIR / "exports"
EXPORT_DIR.mkdir(exist_ok=True, parents=True)
SPAM_DATASET_PATH = EXPORT_DIR / "spam-dataset.csv"

ZIPS_DIR = DATASET_DIR / 'zips'
ZIPS_DIR.mkdir(exist_ok=True, parents=True)

SPAM_SMS_ZIP_PATH = ZIPS_DIR / "sms-spam-dataset.zip"
SPAM_YOUTUBE_ZIP_PATH = ZIPS_DIR / "youtube-spam-dataset.zip"

In [2]:
SMS_SPAM_ZIP = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
YOUTUBE_SPAM_ZIP = "https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip"

In [3]:
!curl $SMS_SPAM_ZIP -o $SPAM_SMS_ZIP_PATH

!curl $YOUTUBE_SPAM_ZIP -o $SPAM_YOUTUBE_ZIP_PATH       

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  198k  100  198k    0     0   428k      0 --:--:-- --:--:-- --:--:--  427k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  159k  100  159k    0     0   350k      0 --:--:-- --:--:-- --:--:--  350k


In [4]:
SPAM_CLASSIFIER_DIR = DATASET_DIR / "spam-classifier"
SMS_SPAM_DIR = SPAM_CLASSIFIER_DIR / "spam-sms"
YOUTUBE_SPAM_DIR = SPAM_CLASSIFIER_DIR / "youtube-spam"


SMS_SPAM_DIR.mkdir(exist_ok=True, parents=True)
YOUTUBE_SPAM_DIR.mkdir(exist_ok=True, parents=True)

In [5]:
!unzip -o $SPAM_SMS_ZIP_PATH -d $SMS_SPAM_DIR
!unzip -o $SPAM_YOUTUBE_ZIP_PATH -d $YOUTUBE_SPAM_DIR

Archive:  /Users/cfe/Dev/ai-api/datasets/zips/sms-spam-dataset.zip
  inflating: /Users/cfe/Dev/ai-api/datasets/spam-classifier/spam-sms/SMSSpamCollection  
  inflating: /Users/cfe/Dev/ai-api/datasets/spam-classifier/spam-sms/readme  
Archive:  /Users/cfe/Dev/ai-api/datasets/zips/youtube-spam-dataset.zip
  inflating: /Users/cfe/Dev/ai-api/datasets/spam-classifier/youtube-spam/Youtube01-Psy.csv  
  inflating: /Users/cfe/Dev/ai-api/datasets/spam-classifier/youtube-spam/__MACOSX/._Youtube01-Psy.csv  
  inflating: /Users/cfe/Dev/ai-api/datasets/spam-classifier/youtube-spam/Youtube02-KatyPerry.csv  
  inflating: /Users/cfe/Dev/ai-api/datasets/spam-classifier/youtube-spam/__MACOSX/._Youtube02-KatyPerry.csv  
  inflating: /Users/cfe/Dev/ai-api/datasets/spam-classifier/youtube-spam/Youtube03-LMFAO.csv  
  inflating: /Users/cfe/Dev/ai-api/datasets/spam-classifier/youtube-spam/__MACOSX/._Youtube03-LMFAO.csv  
  inflating: /Users/cfe/Dev/ai-api/datasets/spam-classifier/youtube-spam/Youtube04-Emine

In [8]:
sms_spam_input_path = SMS_SPAM_DIR / "SMSSpamCollection" # tsv
# sms_spam_input_path.read_text()



In [9]:
for path in YOUTUBE_SPAM_DIR.glob("*"):
    print(path)

/Users/cfe/Dev/ai-api/datasets/spam-classifier/youtube-spam/Youtube03-LMFAO.csv
/Users/cfe/Dev/ai-api/datasets/spam-classifier/youtube-spam/Youtube04-Eminem.csv
/Users/cfe/Dev/ai-api/datasets/spam-classifier/youtube-spam/Youtube05-Shakira.csv
/Users/cfe/Dev/ai-api/datasets/spam-classifier/youtube-spam/Youtube02-KatyPerry.csv
/Users/cfe/Dev/ai-api/datasets/spam-classifier/youtube-spam/__MACOSX
/Users/cfe/Dev/ai-api/datasets/spam-classifier/youtube-spam/Youtube01-Psy.csv


In [10]:
import pandas as pd

In [36]:
sms_spam_input_path = SMS_SPAM_DIR / "SMSSpamCollection"
sms_df = pd.read_csv(sms_spam_input_path, sep='\t', header=None)
sms_df.columns = ['label', 'text']
sms_df['source'] = 'sms-spam'
sms_df.tail()

Unnamed: 0,label,text,source
5567,spam,This is the 2nd time we have tried 2 contact u...,sms-spam
5568,ham,Will ü b going to esplanade fr home?,sms-spam
5569,ham,"Pity, * was in mood for that. So...any other s...",sms-spam
5570,ham,The guy did some bitching but I acted like i'd...,sms-spam
5571,ham,Rofl. Its true to its name,sms-spam


In [37]:
1 == True

True

In [46]:

my_dfs = []
for path in YOUTUBE_SPAM_DIR.glob("*.csv"):
    raw_df = pd.read_csv(path)
    raw_df.rename(columns={"CLASS": 'raw_label', "CONTENT": "text"}, inplace=True)
    raw_df['label'] = raw_df['raw_label'].apply(lambda x: "spam" if str(x) == "1" else "ham")
    raw_df['raw_source'] = str(path.name)
    raw_df['source'] = 'youtube-spam'
    df = raw_df.copy()[['label', 'text', 'source']]
    my_dfs.append(df)
    # print(df.head())

yt_df = pd.concat(my_dfs)

In [47]:
yt_df.head()

Unnamed: 0,label,text,source
0,ham,"<a href=""http://www.youtube.com/watch?v=KQ6zr6...",youtube-spam
1,ham,wierd but funny﻿,youtube-spam
2,spam,"Hey guys, I&#39;m a human.<br /><br /><br />Bu...",youtube-spam
3,ham,Party Rock....lol...who wants to shuffle!!!﻿,youtube-spam
4,ham,Party rock﻿,youtube-spam


In [48]:
df = pd.concat([sms_df, yt_df])
df.head()

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam
3,ham,U dun say so early hor... U c already then say...,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms-spam


In [50]:
df.to_csv(SPAM_DATASET_PATH, index=False)