In [1]:
import pandas as pd
import re
import pkg_resources

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from experiment_ensemble import cross_validation
from sklearn.preprocessing import StandardScaler
from sklearn import svm



In [44]:
train=pd.read_csv('newData/video_train.csv')
test=pd.read_csv('newData/video_test.csv')
all_data = train.append(test)

def load_stp():
    stp_set = set()
    data_file = open('stop_word.dic')
    for line in data_file:
        line = line.strip()
        if line not in stp_set:
            stp_set.add(line)
    data_file.close()
    return stp_set

def text_cleaner(text):
    stopword = load_stp()
    text = text.strip()
    text = re.sub('\n', ' ', text)
    text = re.sub('\r', ' ', text)
    #text = re.sub('\d', ' ', text)
    text = re.sub(r'\W+', ' ', text)
    text = text.split(' ')
    text = ' '.join(filter(lambda x: x.lower() not in stopword, text))
    return text

def text_cleaner_with_stemming(text):
    stopword = load_stp()
    text = text.strip()
    text = re.sub('\n', ' ', text)
    text = re.sub('\r', ' ', text)
    text = text.split(' ')
    text = ' '.join(filter(lambda x: x.lower() not in stopword, text))
    
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    output   = stemmer.stem(text)
    stopword = load_stp()
    return output

In [42]:
def extract_part(data):
    clean_text = []
    bitrate = []
    label = []
    for index, row in data.iterrows():
        clean_text.append(text_cleaner(str(row['title'])) + ' ' +text_cleaner(str(row['description'])))
        bitrate.append([row['original_bitrate'], row['original_audio_bitrate'], row['original_video_bitrate'], row['duration']])
        label.append(row['is_spam'])
        #print index
    return clean_text, bitrate, label

In [45]:

all_data.shape
#def cross_validation(clean_text, label, clf):

(7977, 10)

In [53]:
clean_text, bitrate, label = extract_part (all_data)

In [54]:

columns = ['is_spam', 'text', 'original_bitrate', 'original_audio_bitrate', 'original_video_bitrate', 'duration']
df = pd.DataFrame(columns=columns)
for i in range(len(clean_text)):
    new_list = [label[i], clean_text[i]] + bitrate[i]
    new_row = pd.DataFrame([new_list], columns=columns)
    df = df.append(new_row)
    

In [55]:
df.to_csv('all_video_text.csv', index=False)

In [2]:
#Experiment TF
data = pd.read_csv('all_video_clean_text.csv')
clean_text = data['text']
label = data['is_spam']
from experiment_ensemble import cross_validation
#text_clf = Pipeline([
#    ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
#    ('classifier', LogisticRegression())
#])

clf = MLPClassifier(solver='adam', alpha=1e-4, hidden_layer_sizes=(30,3), batch_size=10,random_state=1, early_stopping=False, verbose=False, validation_fraction=0.15)
text_clf = Pipeline([
        ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
        #('scaler', StandardScaler()),
        ('clf', clf)
])

print (cross_validation(clean_text, label, text_clf))

CROSSVALIDATION ITERATION 1
CROSSVALIDATION ITERATION 2
CROSSVALIDATION ITERATION 3
CROSSVALIDATION ITERATION 4
CROSSVALIDATION ITERATION 5
CROSSVALIDATION ITERATION 6
CROSSVALIDATION ITERATION 7
CROSSVALIDATION ITERATION 8
CROSSVALIDATION ITERATION 9
CROSSVALIDATION ITERATION 10
(0.94833175273647363, 0.9657152693159482, 0.97563636471821202, 0.95750324408781551)


In [7]:
#Experiment TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(ngram_range=(1, 2))
text_clf = Pipeline([
        ('count_vectorizer', cv),
        ('clf', svm.SVC(tol = 1))
])

#clf = MLPClassifier(solver='adam', alpha=1e-4, hidden_layer_sizes=(30,3), batch_size=10,random_state=1, early_stopping=False, verbose=False, validation_fraction=0.15)
#text_clf = Pipeline([
#        ('count_vectorizer', cv),
#        ('clf', clf)])

print (cross_validation(clean_text, label, text_clf))

CROSSVALIDATION ITERATION 1
CROSSVALIDATION ITERATION 2
CROSSVALIDATION ITERATION 3
CROSSVALIDATION ITERATION 4
CROSSVALIDATION ITERATION 5
CROSSVALIDATION ITERATION 6
CROSSVALIDATION ITERATION 7
CROSSVALIDATION ITERATION 8
CROSSVALIDATION ITERATION 9
CROSSVALIDATION ITERATION 10
(0.77008937309912828, 0.87011349803690496, 0.77008937309912828, 1.0)


In [9]:
# TF with bitrate

data = pd.read_csv('all_video_clean_text.csv')
clean_text = data['text']
bitrate = data[['original_bitrate', 'original_audio_bitrate', 'original_video_bitrate', 'duration']]
label = data['is_spam']

text_clf = Pipeline([
    ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', clf)
])
clf_bitrate = svm.SVC(tol=1)

print (cross_validation_with_bitrate(clean_text, bitrate.as_matrix(), label, text_clf, clf_bitrate))


CROSSVALIDATION ITERATION 1
CROSSVALIDATION ITERATION 2
CROSSVALIDATION ITERATION 3
CROSSVALIDATION ITERATION 4
CROSSVALIDATION ITERATION 5
CROSSVALIDATION ITERATION 6
CROSSVALIDATION ITERATION 7
CROSSVALIDATION ITERATION 8
CROSSVALIDATION ITERATION 9
CROSSVALIDATION ITERATION 10
(0.97191659903461125, 0.98218604029849155, 0.97151589023349738, 0.9934864013135245)


In [6]:


#Experiment TFIDF with bitrate
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(ngram_range=(1, 2))
clf = MLPClassifier(solver='adam', alpha=1e-4, hidden_layer_sizes=(30,3), batch_size=10,random_state=1, early_stopping=False, verbose=False, validation_fraction=0.15)
pipeline = Pipeline([
        ('count_vectorizer', cv),
        ('clf', clf)
])
clf_bitrate = svm.SVC(tol=1)
print (cross_validation_with_bitrate(clean_text, bitrate.as_matrix(), label, pipeline, clf_bitrate))

NameError: name 'cross_validation_with_bitrate' is not defined

In [8]:
type(clean_text)

pandas.core.series.Series

In [41]:
clf = MLPClassifier(solver='adam', alpha=1e-4, hidden_layer_sizes=(30,3), batch_size=10,random_state=1, early_stopping=False, verbose=False, validation_fraction=0.15)
clf_bitrate =  LogisticRegression()
print (cross_validation(bitrate.as_matrix(), label, clf_bitrate))

CROSSVALIDATION ITERATION 1
CROSSVALIDATION ITERATION 2
CROSSVALIDATION ITERATION 3
CROSSVALIDATION ITERATION 4
CROSSVALIDATION ITERATION 5
CROSSVALIDATION ITERATION 6
CROSSVALIDATION ITERATION 7
CROSSVALIDATION ITERATION 8
CROSSVALIDATION ITERATION 9
CROSSVALIDATION ITERATION 10
(0.80971605942414493, 0.88991728787228297, 0.80501606719936092, 0.99560260586319238)


In [4]:
# Ensemble TF
from sklearn.feature_extraction.text import TfidfVectorizer
data = pd.read_csv('all_video_clean_text.csv')
clean_text = data['text']
label = data['is_spam']
bitrate = data[['original_bitrate', 'original_audio_bitrate', 'original_video_bitrate', 'duration']]

text_clf_1 = Pipeline([
        ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ('clf', LogisticRegression())
])
text_clf_2 = Pipeline([
        ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ('clf', MultinomialNB())
])
text_clf_3 = Pipeline([
        ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ('clf', svm.SVC(tol=1))
])

bitrate_clf = svm.SVC(tol=1)

from experiment_ensemble import cross_validation_ensemble

print(cross_validation_ensemble(clean_text, bitrate.as_matrix(), label, [text_clf_1, text_clf_2, text_clf_3] , bitrate_clf))

CROSSVALIDATION ITERATION 1
CROSSVALIDATION ITERATION 2
CROSSVALIDATION ITERATION 3
CROSSVALIDATION ITERATION 4
CROSSVALIDATION ITERATION 5
CROSSVALIDATION ITERATION 6
CROSSVALIDATION ITERATION 7
CROSSVALIDATION ITERATION 8
CROSSVALIDATION ITERATION 9
CROSSVALIDATION ITERATION 10
(0.9710409771836559, 0.98177872048354242, 0.96810483581590534, 0.99641746775773954)


In [2]:

data = pd.read_csv('all_video_clean_text.csv')
clean_text = data['text']
bitrate = data[['original_bitrate', 'original_audio_bitrate', 'original_video_bitrate', 'duration']]
label = data['is_spam']

In [4]:
sum(label)


6143.0

In [3]:
bitrate['label'] = label

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [6]:
bitrate.to_csv('bitrate.csv', index=False)