In [0]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import coo_matrix, hstack
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import string
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import nltk
# NLTK resources fetched at notebook start-up.
# 'punkt': tokenizer models — presumably what word_tokenize() in preprocessing() needs; confirm.
nltk.download('punkt')
# NOTE(review): no part-of-speech tagging call is visible in this notebook; this download may be unnecessary.
nltk.download('averaged_perceptron_tagger')
# 'stopwords': the corpus read by stopwords.words('english') in preprocessing().
nltk.download('stopwords')

In [0]:
###Reading the training data
# Each line of the training file is "<user_id><whitespace><tweet text>".
l1=[]  # user ids, one per kept line
l2=[]  # tweet text (remainder of the line, trailing newline included)
with open('train_tweets.txt',encoding ='utf-8') as f:
    for line in f:
        parts = line.split(maxsplit=1)
        # Blank lines and lines carrying only a user id have no tweet text;
        # skip them instead of failing with IndexError.
        if len(parts) < 2:
            continue
        user_id, tweet = parts
        l1.append(user_id)
        l2.append(tweet)

# Build the frame column-wise rather than as a 2 x N frame that is then transposed.
df = pd.DataFrame({'user_id': l1, 'tweet': l2})


In [0]:
##Preprocessing of tweets: only tweets with more than 5 and fewer than 30 words are kept, all others become NaN
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
_STOP_WORDS_CACHE = []


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    if not _STOP_WORDS_CACHE:
        from nltk.corpus import stopwords
        _STOP_WORDS_CACHE.append(frozenset(stopwords.words('english')))
    return _STOP_WORDS_CACHE[0]


def preprocessing(text):
    """Clean a single tweet for the word-level vectorizers.

    Only tweets with more than 5 and fewer than 30 whitespace-separated words
    are processed. Those are tokenized, lower-cased, stripped of punctuation,
    reduced to alphabetic tokens that are not English stop words, and finally
    Porter-stemmed.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str or float
        The stemmed tokens joined by single spaces (possibly an empty string
        when every token was filtered out), or ``np.nan`` when the tweet is
        outside the accepted length range.
    """
    n_words = len(text.split())
    if not 5 < n_words < 30:
        return np.nan

    tokens = [w.lower() for w in word_tokenize(text)]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    stop_words = _english_stop_words()
    words = [w for w in words if w not in stop_words]
    porter = PorterStemmer()
    # Stem the cleaned words: stemming the raw tokens instead would silently
    # discard the punctuation, alphabetic and stop-word filtering above.
    stemmed = [porter.stem(word) for word in words]

    return " ".join(stemmed)

In [0]:
# Add the cleaned text; tweets rejected by preprocessing() come back as NaN.
df['preprocessed_tweet'] = df['tweet'].apply(preprocessing)
# Drop rows containing NaN, then keep only users that still have more than 20 tweets.
df = (
    df.dropna()
      .groupby('user_id')
      .filter(lambda user_rows: len(user_rows) > 20)
)
df.shape

In [0]:
##reading the test data
# Every line of the unlabeled file is one tweet; lines are kept verbatim.
with open('test_tweets_unlabeled.txt',encoding ='utf-8') as f:
    l3 = f.readlines()

# Single-column frame named 'tweet', one row per line read.
testdf = pd.DataFrame([l3], index=['tweet']).transpose()

In [0]:
###Genration of counts for numeric features
%%time
df['char_count'] = df['tweet'].apply(len)
df['word_count'] = df['tweet'].apply(lambda x: len(x.split()))
df['word_density'] = df['char_count'] / (df['word_count']+1)
df['punctuation_count'] = df['tweet'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
df['title_word_count'] = df['tweet'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
df['upper_case_word_count'] = df['tweet'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

testdf['char_count'] = testdf['tweet'].apply(len)
testdf['word_count'] = testdf['tweet'].apply(lambda x: len(x.split()))
testdf['word_density'] = testdf['char_count'] / (testdf['word_count']+1)
testdf['punctuation_count'] = testdf['tweet'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
testdf['title_word_count'] = testdf['tweet'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
testdf['upper_case_word_count'] = testdf['tweet'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [0]:
##Functions to check presence of urls, handles and RTs
def checkHandle(text):
    """Return 1 if the exact token '@handle' is one of the whitespace-separated words of text, else 0."""
    return int('@handle' in text.split())


def checkRT(text):
    """Return 1 if the exact token 'RT' is one of the whitespace-separated words of text, else 0.

    Inputs that cannot be split (for example None or a float NaN) yield 0
    instead of raising.
    """
    try:
        return int('RT' in text.split())
    except Exception:
        # Best effort for non-string input. Catching Exception rather than using
        # a bare except lets KeyboardInterrupt / SystemExit propagate.
        return 0

def checkURL(text):
    """Return 1 if the substring 'http' occurs anywhere in text, else 0.

    Inputs that do not support the membership test (for example None or a
    float NaN) yield 0 instead of raising.
    """
    try:
        return int('http' in text)
    except Exception:
        # Best effort for non-string input. Catching Exception rather than using
        # a bare except lets KeyboardInterrupt / SystemExit propagate.
        return 0



In [0]:
# Binary presence flags, added to the training frame and then the test frame for each flag.
for flag_name, flag_fn in (('@handle', checkHandle), ('RT', checkRT)):
    df[flag_name] = df['tweet'].apply(flag_fn)
    testdf[flag_name] = testdf['tweet'].apply(flag_fn)
# NOTE(review): checkLength is not defined in any cell visible in this notebook — confirm where it comes from.
testdf['tweet_length'] = testdf['tweet'].apply(lambda tweet: checkLength(tweet))
df['tweet_length'] = df['tweet'].apply(lambda tweet: checkLength(tweet))

In [0]:
##Normalizing the counts of the data frames
# StandardScaler is used through the top-of-notebook import. Importing the
# sklearn `preprocessing` module here would rebind the name of the
# preprocessing() text-cleaning function and break re-runs of the cell that calls it.

# Training features: everything after user_id / tweet / preprocessed_tweet.
x_train = df[df.columns[3:55]]
# At most the first 16 feature columns are standardized.
train_norm = x_train[x_train.columns[0:16]]
std_scale = StandardScaler().fit(train_norm)
x_train_norm = std_scale.transform(train_norm)
training_norm_col = pd.DataFrame(x_train_norm, index=train_norm.index, columns=train_norm.columns)
# Rebuild the frame from the scaled columns plus any remaining unscaled ones,
# rather than writing floats into the integer columns of a slice in place.
x_train = pd.concat([training_norm_col, x_train[x_train.columns[16:]]], axis=1)
print (x_train.head())

# Test features: everything after the tweet column, scaled with the statistics
# fitted on the training frame.
x_test = testdf[testdf.columns[1:55]]
test_norm = x_test[x_test.columns[:16]]
x_test_norm = std_scale.transform(test_norm)
testing_norm_col = pd.DataFrame(x_test_norm, index=test_norm.index, columns=test_norm.columns)
x_test = pd.concat([testing_norm_col, x_test[x_test.columns[16:]]], axis=1)

In [0]:
## creation of count vectors
word_vectorizer = CountVectorizer(analyzer='word',encoding = 'utf-8',stop_words='english', max_features=7000)
char_vectorizer = CountVectorizer(encoding='utf-8',analyzer='char',max_features=500)
# Vectorize the text columns themselves. Passing the numeric feature frame
# would iterate over its column labels and produce one row per column,
# which cannot line up with y_train.
xtrain1word = word_vectorizer.fit_transform(df['preprocessed_tweet'])
xtrain1char = char_vectorizer.fit_transform(df['tweet'])
# One row per training tweet: word counts followed by character counts.
x_train1 = hstack([xtrain1word,xtrain1char])

y_train = df.user_id
##Count vectors for test
# NOTE(review): the word vocabulary is learned from the stemmed 'preprocessed_tweet'
# text but applied to raw test tweets here; testdf has no preprocessed column — confirm this is intended.
xtestword = word_vectorizer.transform(testdf['tweet'])
xtestcount = char_vectorizer.transform(testdf['tweet'])
x_test = hstack([xtestword,xtestcount])

In [0]:
# word level tf-idf
tfidf_vectword = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=1500)
tfidf_vectword.fit(df['preprocessed_tweet'])
# Transform the same column the vectorizer was fitted on; the name `x` used
# previously is not defined anywhere in this notebook.
xtrain_tfidf =  tfidf_vectword.transform(df['preprocessed_tweet'])

In [0]:
##training the model
%%time
mb = MultinomialNB(alpha=0.08)
mb.fit(x_train1,y_train)

y_pred = mb.predict(x_test_test)
print(mb.score(x_train1,y_train))


In [0]:
# Ids are 1-based and follow the order of the predictions.
submission = pd.DataFrame({
    "Id": list(range(1, len(y_pred) + 1)),
    "Predicted": y_pred,
})
# Write without the index column.
submission.to_csv("submission.csv", index=None)