## Build supervised spam detector

- Dataset to be used for traning: "Spam Training.csv"
- Fit and transform the training data X_train using a Count Vectorizer ignoring terms that have a document frequency strictly lower than **5** and using **character n-grams from n=2 to n=5.**

- To tell Count Vectorizer to use character n-grams pass in `analyzer='char_wb'` which creates character n-grams only from text inside word boundaries. This should make the model more robust to spelling mistakes.

- Using this document-term matrix and the following additional features:
    * the length of document (number of characters)
    * number of digits per document
    * **number of non-word characters (anything other than a letter, digit or underscore.)**

- fit a Logistic Regression model with regularization C=100. Then compute the area under the curve (AUC) score using the transformed test data.

#### Spam Model

In [3]:
def spam_model():
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score
    import re
    import numpy as np
    from scipy.sparse import csr_matrix, hstack

    # Import and tranform spam dataset
    spam_data = pd.read_csv('../data/4.Spam Training.csv')
    spam_data['target'] = np.where(spam_data['target']=='spam',1,0)
    X_train, X_test, y_train, y_test = train_test_split(spam_data['text'], spam_data['target'], random_state=0)

    # vectorize train, test sets
    spam_vect=CountVectorizer(min_df=5, analyzer='char_wb', ngram_range=(2,5)).fit(X_train)
    X_train_vectorized = spam_vect.transform(X_train)
    X_test_vectorized = spam_vect.transform(X_test)

    # add features
    num_digit_X_train=X_train.apply(lambda x:len(re.findall(r'\d', x)))
    num_non_word_X_train=X_train.apply(lambda x:len(re.findall(r'\W', x)))
    X_train_vectorized_len_digit_nonword=hstack([X_train_vectorized, csr_matrix([X_train.str.len(), 
                                                                                 num_digit_X_train, 
                                                                                 num_non_word_X_train]).T], 'csr')

    num_digit_X_test=X_test.apply(lambda x:len(re.findall(r'\d', x)))
    num_non_word_X_test=X_test.apply(lambda x:len(re.findall(r'\W', x)))
    X_test_vectorized_len_digit_nonword=hstack([X_test_vectorized,csr_matrix([X_test.str.len(),
                                                                            num_digit_X_test,
                                                                            num_non_word_X_test]).T],'csr')

    # Creat prediction model
    spam_model=LogisticRegression(C=100).fit(X_train_vectorized_len_digit_nonword,y_train)

    # Evaluate the model
    feature_names = np.array(spam_vect.get_feature_names()+['length_of_doc', 'digit_count', 'non_word_char_count'])
    sorted_coef_index = spam_model.coef_[0].argsort()
    smallest = feature_names[sorted_coef_index[:10]]
    largest = feature_names[sorted_coef_index[:-11:-1]]
    spam_predictions = spam_model.predict(X_test_vectorized_len_digit_nonword)
    print('test score:', roc_auc_score(y_test, spam_predictions))
    print('list of smallest coef index:', list(smallest))
    print('list of largest coef index:', list(largest))
    
    return spam_vect, spam_model

spam_vect, spam_model=spam_model()

test score: 0.9788593110707434
list of smallest coef index: ['. ', '..', '? ', ' i', ' y', ' go', ':)', ' h', 'go', ' m']
list of largest coef index: ['digit_count', 'ne', 'ia', 'co', 'xt', ' ch', 'mob', ' x', 'ww', 'ar']


In [4]:
def spam_predict(docs):
    from scipy.sparse import csr_matrix, hstack
    import re
    docs_vectorized = spam_vect.transform(docs)
    num_digit=docs.apply(lambda x:len(re.findall(r'\d', x)))
    num_nonword=docs.apply(lambda x:len(re.findall(r'\W', x)))
    docs_vectorized_len_digit_nonword=hstack([docs_vectorized,csr_matrix([docs.str.len(),
                                                                    num_digit,
                                                                    num_nonword]).T],'csr')
    return spam_model.predict(docs_vectorized_len_digit_nonword)

###### Note: Need better spam dataset, or try different spam detection model later

In [15]:
import pandas as pd
df=pd.read_csv('../data/3.pulledTweets-English.csv').drop_duplicates(subset=['text']).reset_index()
df.text

0      "House of Cards" has always hit differently 🎂 ...
1      POWER A Enhanced Wireless Controller for Ninte...
2      Select 1st party Switch $39.99 physical/digita...
3      Silicone Eye Pad for Oculus Quest 2 Face Cushi...
4      Power Strip Surge Protector- 5 Outlets 3 USB P...
                             ...                        
227    Sometimes you get a third chance - Love's Thir...
228    Check out this Amazon deal: The Art of My Neig...
229    RT @bigtickHK: First Steps: How Upright Walkin...
230    RT @bigtickHK: The First Day of Spring by Nanc...
231    ad: $29.99 (50% off) \n \nGalaxy Star Projecto...
Name: text, Length: 232, dtype: object

In [16]:
df=df[df.text!='']
df

Unnamed: 0.1,index,Unnamed: 0,created_at,id,text,language
0,0,0,Sun May 02 07:59:00 +0000 2021,1.388765e+18,"""House of Cards"" has always hit differently 🎂 ...",en
1,1,1,Sun May 02 22:11:00 +0000 2021,1.388979e+18,POWER A Enhanced Wireless Controller for Ninte...,en
2,2,2,Sun May 02 14:21:00 +0000 2021,1.388861e+18,Select 1st party Switch $39.99 physical/digita...,en
3,3,3,Mon May 03 23:59:58 +0000 2021,1.389369e+18,Silicone Eye Pad for Oculus Quest 2 Face Cushi...,en
4,4,4,Mon May 03 23:59:52 +0000 2021,1.389369e+18,Power Strip Surge Protector- 5 Outlets 3 USB P...,en
...,...,...,...,...,...,...
227,251,251,Tue Apr 27 23:58:10 +0000 2021,1.387194e+18,Sometimes you get a third chance - Love's Thir...,en
228,252,252,Tue Apr 27 23:58:10 +0000 2021,1.387194e+18,Check out this Amazon deal: The Art of My Neig...,en
229,253,253,Tue Apr 27 23:58:08 +0000 2021,1.387194e+18,RT @bigtickHK: First Steps: How Upright Walkin...,en
230,254,254,Tue Apr 27 23:58:04 +0000 2021,1.387194e+18,RT @bigtickHK: The First Day of Spring by Nanc...,en


In [23]:
def clean_spams(file):
    import pandas as pd
    df=pd.read_csv(file).drop_duplicates(subset=['text']).reset_index()
   
    import re
    for i in range(len(df)):
        try:
            txt = df.loc[i]["text"]
            txt=re.sub(r'@[A-Z0-9a-z_:]+','',txt)#replace username-tags
            txt=re.sub(r'^[RT]+','',txt)#replace RT-tags
            txt = re.sub('https?://[A-Za-z0-9./]+','',txt)#replace URLs
            txt=re.sub("[^a-zA-Z]", " ",txt)#replace hashtags
            df.at[i,"text"]=txt
        except:
            df.at[i,"text"]=''
            continue

    df['spam']=spam_predict(df['text'])
    df.to_csv('../data/4.Tweet data before clear spams.csv')
    del df['Unnamed: 0']
    df=df[df['spam']==0]
    del df['spam']
    df=df.where(df.text!='')
    df.to_csv('../data/4.Tweet data after clear spams.csv')
    return df['text']

In [24]:
clean_spams('../data/3.pulledTweets-English.csv')

0       House of Cards  has always hit differently   ...
2      Select  st party Switch        physical digita...
3      Silicone Eye Pad for Oculus Quest   Face Cushi...
4      Power Strip Surge Protector    Outlets   USB P...
6       Hi  You can turn off the microphone  When thi...
                             ...                        
227    Sometimes you get a third chance   Love s Thir...
228    Check out this Amazon deal  The Art of My Neig...
229      First Steps  How Upright Walking Made Us Hum...
230      The First Day of Spring by Nancy Tucker    B...
231    ad              off     Galaxy Star Projector ...
Name: text, Length: 224, dtype: object