In [5]:
import os
import sklearn
import numpy as np
from glob import glob
import gensim
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline 

In [6]:
# data_dir = '/home/ed/github/pod_tweets/follower_twts/'
data_dir = '/run/media/ed/SD/follower_twts/'
categories = os.listdir(data_dir)
print(categories)

docs_to_train = sklearn.datasets.load_files(data_dir, description=None,
                                            categories=categories, load_content=True,
                                            encoding='utf-8', shuffle=True,
                                            random_state=42)

['AceOnTheHouse', '83WeekswithEricBischoff', '48Hours', 'Accused', '99Invisible', 'AdamCarollaShow', '2DopeQueens', '1YearDailyAudioBible', '1A', '30For30Podcasts', '60Minutes', 'PlanetMoney', 'AccidentalTechPodcast', 'PodSaveAmerica', 'AliceIsntDead', 'AlisonRosenIsYourNewBestFriend', 'BehindtheBastards', 'SeincastASeinfeldPodcast', 'KnowledgeFight']


In [8]:
# Split the loaded dataset into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(docs_to_train.data, docs_to_train.target, test_size=0.2)
y_train

array([12, 13, 18, ..., 12,  7,  8])

In [9]:
test_list = X_train[6:8]

In [10]:
import gensim
import re
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import preprocessor as p


p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.EMOJI)
stemmer = SnowballStemmer('english')
punct_str = '''!"$%&'()*+,-./:;<=>?[\]^_`{|}~’'''
# stop_words = STOPWORDS
stop_words = STOPWORDS.union(set(['', 'ive', 'im', 'amp']))
#Emoji patterns
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)


def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))


def my_preprocess(text):
    '''
    '''
    doc_emoji = split_count(text)
    doc_hash = find_hash(text)
    doc_mentions = find_mention(text)
    text = text.lower()
    text = text.replace('\\n',' ')
    text = p.clean(text)
    text = text.translate(str.maketrans(' ', ' ', punct_str))
    text = re.sub(r' \d+ ', ' ', text)
    text = re.sub(r' \d+ ', ' ', text)
    words = []
    for word in text.split(' '):
        words.append(word)
    words = [w for w in words if not w in stop_words]
    output = ' '.join(words)
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in stop_words and len(token) > 3:
            result.append(lemmatize_stemming(token))
    
    result = result + doc_emoji + doc_hash + doc_mentions
    tweet_txt = ' '.join(result)
    return tweet_txt

# HASHTAG_PATTERN = re.compile(r'#\w*')
# MENTION_PATTERN = re.compile(r'@\w*')

import emoji
import regex

def split_count(text):
    emoji_list = []
    data = regex.findall(r'\X', text)
    for word in data:
        if any(char in emoji.UNICODE_EMOJI for char in word):
            emoji_list.append(word)

    return emoji_list

def find_hash(text):
    hashtag_list = []
    data = regex.findall(r'#\w*', text)
    for word in data:
        hashtag_list.append(word)
    return hashtag_list

def find_mention(text):
    mention_list = []
    data = regex.findall(r'@\w*', text)
    for word in data:
        mention_list.append(word)
    return mention_list

doc_sample = X_train[7]
# doc_emoji = split_count(doc_sample)
# doc_hash = find_hash(doc_sample)
# doc_mentions = find_mention(doc_sample)
print('::Input test:')
print(doc_sample)

doc_sample = my_preprocess(doc_sample)

# all_words = doc_sample + doc_emoji + doc_hash + doc_mentions
# tweet_txt = ' '.join(all_words)
processed_txt = my_preprocess(doc_sample)
print('::Processed text:')
print(processed_txt)

::Input test:
["@saladinahmed Black Bolt is one of the best books at Marvel right now. I can't wait for this.", '@FanBrosShow Just figured out what to do with all the #ComicsICopped this year. https://t.co/hZjWpdLRKS', '@jaggedlittlehil Stop.', '@jaggedlittlehil This is so scary.', "@eveewing It was great to see you perform this in NYC. I'm excited for the book!", 'Really proud that my great uncle is a founding member of this group.\nhttps://t.co/T3cTDNWqmc', '@FanBrosShow Just finished all of the Patternist books. Really hard not to imagine an amazing Inhumans story there.', "America's slow but very real decline into a fascist state as told by the Milwaukee Bucks logo https://t.co/UTVsEyq3kg", "@ulabeast I totally remember working on this as an intern in Summer '10.", 'Deadass. :( https://t.co/QysTPMlLEG']
::Processed text:
black bolt best book marvel right wait figur year stop scari great perform excit book proud great uncl member group finish patternist book hard imagin amaz inhuman

In [11]:
reformat = lambda x: my_preprocess(x)
test_list_m = list(map(reformat, test_list))
test_list_m

['think power tattoo afraid underneath watch akira draw akira comic send help ahhhh look amaz handsom life dont know explain littl vivid composit life love inspir what poppin joint sleep entir life think dyinnggg ooz boiii long relationship wait accept ball sauc tast peopl draw slim fit black turtleneck draw slim fit black turtleneck draw slim fit black turtleneck draw slim fit black turtleneck draw slim fit black turtleneck draw @pynch__me @OneTrickTofani @JustinWabs @PancakeJoji @muwurder @_Cosmoetic_ @milkymosaic @hiya_cass @OneTrickTofani',
 'black bolt best book marvel right wait figur year stop scari great perform excit book proud great uncl found member group finish patternist book hard imagin amaz inhuman stori america slow real declin fascist state tell milwauke buck logo total rememb work intern summer deadass #ComicsICopped @saladinahmed @FanBrosShow @jaggedlittlehil @jaggedlittlehil @eveewing @FanBrosShow @ulabeast']

In [12]:
# Transform the training data into a form the classifier can work with
vectorizer = CountVectorizer(token_pattern=r'[^\s]+', encoding='unicode', lowercase=None, strip_accents=None, stop_words=None)
reformat = lambda x: my_preprocess(x)
X_train_m = list(map(reformat, X_train))
X = vectorizer.fit_transform(X_train_m)

tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X)

text_clf = Pipeline([('vect', vectorizer),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, 
    verbose=1)),])
# text_clf.fit(X_, y_train)

In [13]:
X_train_counts = count_vect.fit_transform(raw_documents=X_train)

tfidf_transformer = TfidfTransformer(use_idf=True)

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

NameError: name 'count_vect' is not defined

In [14]:
# Transform the test data
count_vect = CountVectorizer(stop_words='english')
X_test_counts = count_vect.fit_transform(raw_documents=X_test)

tfidf_transformer = TfidfTransformer(use_idf=True)
X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)

In [15]:
# Pipeline Code
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, 
    verbose=1)),])

In [16]:
# Deploy the pipeline and train the model
text_clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


-- Epoch 1
Norm: 1.28, NNZs: 104078, Bias: -1.004174, T: 52290, Avg. loss: 0.107010
Total training time: 0.08 seconds.
-- Epoch 2
Norm: 1.12, NNZs: 138906, Bias: -1.003937, T: 104580, Avg. loss: 0.105449
Total training time: 0.14 seconds.
-- Epoch 3
Norm: 1.07, NNZs: 168383, Bias: -1.002710, T: 156870, Avg. loss: 0.105255
Total training time: 0.19 seconds.
-- Epoch 4
Norm: 1.05, NNZs: 192440, Bias: -1.002210, T: 209160, Avg. loss: 0.105150
Total training time: 0.26 seconds.
-- Epoch 5
Norm: 1.04, NNZs: 211046, Bias: -1.001665, T: 261450, Avg. loss: 0.105096
Total training time: 0.31 seconds.
-- Epoch 6
Norm: 1.03, NNZs: 225817, Bias: -1.001352, T: 313740, Avg. loss: 0.105057
Total training time: 0.36 seconds.
-- Epoch 7
Norm: 1.03, NNZs: 237374, Bias: -1.001159, T: 366030, Avg. loss: 0.105033
Total training time: 0.40 seconds.
Convergence after 7 epochs took 0.41 seconds
-- Epoch 1
Norm: 1.36, NNZs: 107937, Bias: -1.005974, T: 52290, Avg. loss: 0.099852
Total training time: 0.08 second

Norm: 1.39, NNZs: 193341, Bias: -1.003318, T: 261450, Avg. loss: 0.086432
Total training time: 0.35 seconds.
-- Epoch 6
Norm: 1.39, NNZs: 204903, Bias: -1.002548, T: 313740, Avg. loss: 0.086409
Total training time: 0.43 seconds.
-- Epoch 7
Norm: 1.39, NNZs: 213577, Bias: -1.002503, T: 366030, Avg. loss: 0.086378
Total training time: 0.50 seconds.
Convergence after 7 epochs took 0.50 seconds
-- Epoch 1
Norm: 1.35, NNZs: 86266, Bias: -1.005034, T: 52290, Avg. loss: 0.096742
Total training time: 0.12 seconds.
-- Epoch 2
Norm: 1.21, NNZs: 112812, Bias: -1.002988, T: 104580, Avg. loss: 0.095447
Total training time: 0.17 seconds.
-- Epoch 3
Norm: 1.17, NNZs: 133255, Bias: -1.001117, T: 156870, Avg. loss: 0.095278
Total training time: 0.22 seconds.
-- Epoch 4
Norm: 1.15, NNZs: 149981, Bias: -1.000883, T: 209160, Avg. loss: 0.095182
Total training time: 0.27 seconds.
-- Epoch 5
Norm: 1.14, NNZs: 163708, Bias: -1.000483, T: 261450, Avg. loss: 0.095137
Total training time: 0.32 seconds.
-- Epoch

[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:    9.2s finished


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                ('clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      

In [17]:
# Test the trained model using the test data
predicted = text_clf.predict(X_test)

In [18]:
print (np.mean(predicted == y_test))

0.33664805323950125


In [19]:
print(metrics.classification_report(y_test, predicted, target_names=docs_to_train.target_names))

                                precision    recall  f1-score   support

                            1A       0.29      0.22      0.25       630
          1YearDailyAudioBible       0.36      0.55      0.43       633
                   2DopeQueens       0.18      0.17      0.17       658
               30For30Podcasts       0.37      0.45      0.41       646
                       48Hours       0.28      0.22      0.25       601
                     60Minutes       0.29      0.26      0.28       593
       83WeekswithEricBischoff       0.48      0.68      0.57       704
                   99Invisible       0.25      0.17      0.21       702
         AccidentalTechPodcast       0.41      0.57      0.48       729
                       Accused       0.36      0.49      0.41       563
                 AceOnTheHouse       0.31      0.31      0.31       584
               AdamCarollaShow       0.37      0.30      0.33       680
                 AliceIsntDead       0.38      0.48      0.42  