In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
!pip install gensim
!pip install tabulate

In [0]:
import nltk
nltk.download('wordnet')
nltk.download("stopwords")

In [5]:
import sys


%matplotlib inline

# basic python libraries

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


# some structures
from collections import Counter, defaultdict
from tabulate import tabulate
import struct

#some models
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC


# for preprocessing / model evaluation ...
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedShuffleSplit

# for productivity
from sklearn.pipeline import Pipeline



# NLP /related /libraries

from gensim.models.word2vec import Word2Vec

# for cleaning ..
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import SnowballStemmer

import regex as re



# some statiscal features ... BOW approach :

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer



In [0]:
stop_words = stopwords.words('english')


train_data = pd.read_csv("gdrive/My Drive/Colab Notebooks/train.csv")
test_data = pd.read_csv("gdrive/My Drive/test.csv")

y = pd.read_csv("gdrive/My Drive/Colab Notebooks/train.csv").Label



train_tweets = train_data.TweetText[:]
test_tweets = test_data.TweetText[:]

In [0]:
FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text) :
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = " {} ".format(hashtag_body.lower())
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result
  
def allcaps(text) :
    text = text.group()
    return text.lower() + " <allcaps>"
  
def emojis_tokenizer(text):
  eyes = r"[8:=;]"
  nose = r"['`\-]?"
  def re_sub(pattern, repl):
    return re.sub(pattern, repl, text, flags=FLAGS)
  
  text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
  text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
  text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
  text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
  text = re_sub(r"<3","love")
  return text

def apostr_tokenizer(text):
  def re_sub(pattern, repl):
    return re.sub(pattern, repl, text, flags=FLAGS)
  text = re_sub(r"\'ll", " will ") 
  text = re_sub(r"\'s", " ")
  text = re_sub(r"\'m", " am")
  text = re_sub(r"\'d", " would")
  text = re_sub(r"\'ve", " have")
  text = re_sub(r"\'re", " are")
  text = re_sub(r"can't", " can not")
  text = re_sub(r"\'t", " not")
  text = re_sub(r"\'\s", "  ")
  text = re_sub(r"\s\'", "  ")
  text = re_sub(r"\'$", "  ")
  text = re_sub(r"^\'", "  ")
  return text 

def fc_mapping(data) :
  #  tokens that ends with fc are mapped to => football club 
  for i in range(len(data)) :
    fc = ["football","club"]
    for j in data[i] :
      l = re.findall("\w+fc",j)
      if len(l)>0 :
        data[i]+=fc
        break
        
def tokenize(text):
    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)  
    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    
    #text = re_sub(r"@\w+", " ")  # removing this ones gave a good accuracy boost
    
    # dealing with emojis :
    text = emojis_tokenizer(text)
    
    
    text = re_sub(r"/"," / ")
    text = re_sub(r"\\"," ")
    
    # numbers / hashtags / elong / allcaps :
    
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
    text = re_sub(r"([A-Z]){2,}", allcaps)
    
    # Some additional cleaning :
    text = re_sub(r"[\,\:\!\@\#\$\%\^\&\*\(\)\_\+\|\~\=\-\{\}\[\]\?\/\.\"\;`]", " ") # punc
    text = re_sub(r"[\<]", " <")
    text = re_sub(r"[\>]", "> ")
    
    text = re_sub(r"fc\s", "fc football club ")
    
    
    # dealing with \'
    text = apostr_tokenizer(text)
    
    
    text = text.lower()
    
    # some simple mapping :
    
    if "gp" in text :
      text = re_sub(r"gp\s", "gp grand prix ")
    if "fc" in text :
      text = re_sub(r"fc\s", "fc football club ")
        
    return text

  
def tokenize_(liste_tweets) :
  tokens = [re.findall(r"\S+",tokenize(i)) for i in liste_tweets]
 
  #for i in range(len(tokens)) :
   #   tokens[i] = [ j for j in tokens[i] if j not in stop_words and len(j)>1 ] 
      
  return tokens

In [0]:
train_tokens = tokenize_(train_tweets)
test_tokens = tokenize_(test_tweets)

In [15]:
mult_nb = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("multinomial nb", MultinomialNB())])

mult_nb_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("multinomial nb", MultinomialNB())])



all_models = [
    ("mult_nb", mult_nb),
    ("mult_nb_tfidf", mult_nb_tfidf),
]

unsorted_scores = [(name, cross_val_score(model, train_tokens, y, cv=10).mean()) for name, model in all_models]
scores = sorted(unsorted_scores, key=lambda x: -x[1])


print (tabulate(scores, floatfmt=".4f", headers=("model", 'score')))

model            score
-------------  -------
mult_nb         0.9585
mult_nb_tfidf   0.9577


In [0]:
all_tokens =  train_tokens[:] + test_tokens[:] 

In [0]:
X_tokens = [k for i in all_tokens for k in i]

most_500 = Counter(X_tokens).most_common(500)

freq_most_used_words = [(i,j,freq_(i),1-freq_(i) )for i,j in most_500]
freq_most_used_words_2 = [i for i in freq_most_used_words if ( i[2]>0.94 or i[3]>0.94 ) and "<" not in i[0] and i[2]<=1]

freq_dict = {i[0]:i[2] for i in freq_most_used_words_2}

In [0]:
print(tabulate(freq_most_used_words_2,headers=("Key_Words","Occurencies","Politics","Sports")))

In [0]:
# augment_meaning(train_tokens)
# augment_meaning(test_tokens)

In [0]:
def map_and_unify_and_augment(data,weight_list,dict_):
  for i in range(len(data)) :
    del_ = []
    add_ = []
    for j in range(len(data[i])) :
      for k in weight_list :
        if k[0] in data[i][j] :
          del_ += [data[i][j]]
          add_ +=  dict_[k[0]]
          break
    print(del_,add_)
    for elm in del_ :
      data[i].remove(elm)
    data[i]+=add_

In [0]:
map_and_unify_and_augment(train_tokens,freq_most_used_words_2,unifying_dict)
map_and_unify_and_augment(test_tokens,freq_most_used_words_2,unifying_dict)

In [0]:
train_tokens = tokenize_2(train_tweets) 
test_tokens = tokenize_2(test_tweets) 

print (tabulate(politics_counter, floatfmt=".4f", headers=("KeyWords", 'Occurencies')))

In [0]:
submit_("nothing2.csv",mult_nb)

In [0]:
def submit_(string_,clf) :
  test_id = test_data.TweetId
  clf.fit(train_tokens,y)
  prediction = clf.predict(test_tokens)
  df = pd.DataFrame(data={"TweetId":test_id,"Label":prediction})
  sub  = df.to_csv(string_,index=False)

In [0]:
def find_s(X_,str_,int_) :
  for i in range(len(X_)) :
    for j in X_[i] :
      if str_ in j :
        if int_ == 1 :
          if j == str_ :
            if i<len(X) :
               print(j," >>>>> ",y[i])
            else :
              print(j," >>>>> ")          
        else : 
          if i<len(X) :
            print(j," >>>>> ",y[i])
          else :
            print(j," >>>>> ")
            
            
            
def map_words(X_) :
  for i in range(len(X_)) :
    for j in range(len(X_[i])) :

      del_ = []
      add_ = []

      for mapp in dic_map :
        if mapp in X_[i][j] :
          del_ += [X_[i][j]]
          add_ +=  dic_map[mapp]
          break

      for elm in del_ :
        X_[i].remove(elm)

      X_[i]+=add_
    X_[i] = [j for j in X_[i] if len(j)>1]            

def map_words2(X_) :
  for i in range(len(X_)) :
    for j in range(len(X_[i])) :

      del_ = []
      add_ = []

      for mapp in dic_map2 :
        if mapp in X_[i][j] :
          del_ += [X_[i][j]]
          add_ +=  dic_map2[mapp]
          break

      for elm in del_ :
        X_[i].remove(elm)

      X_[i]+=add_
    X_[i] = [j for j in X_[i] if len(j)>1] 
            
dic_map = {
           "football":["football","sports"],
           "tennis":["tennis","sports"],
           "basket":["basketball","sports"] ,  
           "cricket":["cricket","sports"],
    
           "ausg":["ausgp","sports","australian grand prix"],
          "motogp":["grand", "prix", "motorcycle" , "racing"],
          "bbl":["bbl","sports"],
           
           "obama":["obama","politics","president"],
           "gov" :["government","politics"],
           "kerry": ["seckerry","politics","secretary"],
            "mandela":["mandela","politics","president"],
            "putin":["putin","politics","president"],
    
            "job" : ["jobs","politics"]
          }

dic_map2 = {
           "football":["football"],
           "tennis":["tennis"],
           "basket":["basketball"] ,  
           "cricket":["cricket"],
           "ausg":["ausgp"],
          "bbl":["bbl"],
          
           "obama":["obama"],
           "gov" :["government"],
           "kerry": ["seckerry"],
            "mandela":["mandela"],
           "job" : ["jobs"]
          }

In [0]:
def find_and_group(word):
  parents = []
  for i in X_tokens :
    if word in i :
      parents +=[i]
  c = Counter(parents).most_common(5)
  print(tabulate(c,headers=("Parents","Occurencies")))
  return

In [0]:
def freq_(s) :
  all_ = 0
  pol = 0
  for i in range(len(train_tokens)) :
    for j in train_tokens[i] :
      if s in j :
        all_ += 1
        if y[i] == "Politics" :
          pol += 1 
  if all_ == 0 :
    return 10
  return pol/all_


def augment_meaning(data) :
  for i in range(len(data)):
    list_add = []
    for j in data[i] :
      for elmt in freq_dict :
        if j == elmt :
          pol_freq = freq_dict[elmt]
          if pol_freq > 0.9 :
            list_add += ["poltics"]
          else :
            list_add +=["sports"]
    data[i] += list_add

In [0]:
# automatic solution :

for each tag :
  check how many times it is counted as politics and sports :
    if its more than 95% in either one of them :
      delete every string with this tag as a substring and insert it with sports or politics.
      
     