In [1]:
# Import required modules
import pandas as pd
import re
from collections import Counter
from itertools import chain
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the tweet dataset from CSV and preview its first five rows
df = pd.read_csv(filepath_or_buffer="final_output.csv")
df.head(n=5)

Unnamed: 0,date,tweets,tokens,category,sentiment
0,2019-11-16 20:47:54,J ust used my @BankofAmerica Mobile App to pay...,"['j', 'ust', 'use', 'mobil', 'app', 'pay', 'bi...",neutral,positive
1,2019-11-01 22:20:12,We just marched near @PeteButtigieg and Chaste...,"['march', 'near', 'chasten', 'cowl', 'common',...",performance,positive
2,2019-08-18 02:04:47,@AskCiti @Citi @Citibank #india Wat policy 2 a...,"['india', 'wat', 'polici', '2', 'accept', 'mgn...",neutral,positive
3,2019-03-03 23:34:48,Sh** purely for sport I need a 30 for 30.. @We...,"['sh', 'pure', 'sport', 'need', '30', '30', '....",competetive,positive
4,2019-10-17 22:17:45,"115 years ago, A.P. Giannini opened the Bank o...","['115', 'year', 'ago', 'p', 'giannini', 'open'...",competetive,neutral


In [3]:
def process_tweets(sentence):
    """Clean a raw tweet and return its lowercase alphanumeric tokens.

    Steps, in order:
      1. lowercase the text and split it on whitespace;
      2. in every token, drop everything from the first "@" onward (mentions);
      3. split the tokens further on "/";
      4. drop whole tokens that end in ".com" (web addresses);
      5. strip every character that is not a letter or digit ("_" included);
      6. discard tokens that ended up empty.

    Parameters
    ----------
    sentence : str
        Raw tweet text.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Lowercase, split on whitespace, and cut each token at its first "@"
    no_username = [re.sub("@.*", "", x) for x in sentence.lower().split()]

    # Split by /
    flattened = list(chain.from_iterable(x.split("/") for x in no_username))

    # Remove web addresses: any token ending in a literal ".com".
    # The dot has to be escaped; left unescaped it matches any character, so
    # ordinary words that merely end in "com" (e.g. "telecom") were wiped too.
    no_special_char = [re.sub(r".*\.com$", "", x) for x in flattened]

    # Remove everything except letters and digits (\W leaves "_", so list it too)
    no_special_char = [re.sub(r"[\W_]+", "", x) for x in no_special_char]

    # Remove empty tokens left behind by the substitutions above
    return [x for x in no_special_char if x]


# Clean every tweet, then glue each token list back into one space-separated string
df["tweets_processed"] = df["tweets"].apply(process_tweets).str.join(" ")

In [4]:
# Features are the cleaned tweet text; the response is the category column
X = df["tweets_processed"]
y = df["category"]

# Hold out 30% of the rows as a test set (random_state pins the shuffle)
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    test_size=0.3,
    random_state=2,
)


# Final model: token counts -> TF-IDF weighting -> logistic regression
logreg = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", LogisticRegression(C=1e5, n_jobs=-1)),  # n_jobs=-1 asks for all cores
    ]
)

# Train the whole pipeline on the training split
logreg.fit(X_train, y_train)

# Predict categories for the held-out tweets
y_pred_logreg = logreg.predict(X_test)

# Report accuracy as a percentage rounded to two decimals
logreg_accuracy = round(100 * accuracy_score(y_test, y_pred_logreg), 2)
print(f"Logistic regression accuracy is: {logreg_accuracy}%")

Logistic regression accuracy is: 92.94%


In [5]:
def make_prediction(tweet):
    """Predict the category of a single raw tweet.

    The tweet is cleaned with ``process_tweets``, re-joined into one
    space-separated string, and passed to the fitted ``logreg`` pipeline.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    pandas.DataFrame
        Columns ``tweet`` (the text as given) and ``predicted_category``
        (the output of ``logreg.predict``).
    """
    cleaned_text = " ".join(process_tweets(tweet))
    # predict() takes a sequence of documents, hence the one-element list
    predicted = logreg.predict([cleaned_text])
    result_columns = {
        "tweet": tweet,
        "predicted_category": predicted,
    }
    return pd.DataFrame(result_columns)

In [6]:
# Preview five randomly drawn rows; no seed is set, so each run picks different ones
random_sample = df.sample(n=5)
random_sample

Unnamed: 0,date,tweets,tokens,category,sentiment,tweets_processed
23361,2019-08-10 14:46:30,@support_a_teach this is my third year teachin...,"['third', 'year', 'teach', 'kindergarten', 'ho...",competetive,positive,this is my third year teaching kindergarten ho...
56111,2019-03-12 19:24:55,JPMorgan Chase & Co. plans to charge just 20 c...,"['jpmorgan', 'chase', 'co', 'plan', 'charg', '...",performance,neutral,jpmorgan chase co plans to charge just 20 cent...
116944,2019-02-17 19:23:41,Don’t Call JP Morgan Chase’s New ‘JPM Coin’ a ...,"['call', 'jp', 'morgan', 'chase', 'new', '‘', ...",competetive,positive,dont call jp morgan chases new jpm coin a cryp...
6981,2019-05-06 22:16:02,@politiconj This is about what hapened to me b...,"['hapen', 'bt']",neutral,neutral,this is about what hapened to me bt https entr...
64025,2019-09-03 10:03:39,"JP Morgan Chase & Co.: Software Engineering, J...","['jp', 'morgan', 'chase', 'co', 'softwar', 'en...",neutral,positive,jp morgan chase co software engineering java d...


In [7]:
# Let's check if the predictions are okay. Looks great!
# Read the tweets straight from the sample: `random_sample.index` holds index
# labels, so passing it to the positional `.iloc` only lines up while `df`
# keeps its default 0..n-1 index.
pd.concat([make_prediction(tweet) for tweet in random_sample["tweets"]])

Unnamed: 0,tweet,predicted_category
0,@support_a_teach this is my third year teachin...,competetive
0,JPMorgan Chase & Co. plans to charge just 20 c...,performance
0,Don’t Call JP Morgan Chase’s New ‘JPM Coin’ a ...,competetive
0,@politiconj This is about what hapened to me b...,neutral
0,"JP Morgan Chase & Co.: Software Engineering, J...",neutral


In [8]:
# A tweet written out as a plain string literal can be classified the same way
make_prediction(
    tweet="@AskCiti @Citi @Citibank #india Wat policy 2 accept MGNREGA job card as address proof? Will NREGA beneficiaries apply 4 ur credit card!? Instead include other valid address proof 4 KYC compliance- electricity bill/lpg/landline bill- my KYC/ address proof issue unsettled 4 months"
)

Unnamed: 0,tweet,predicted_category
0,@AskCiti @Citi @Citibank #india Wat policy 2 a...,neutral
