In [13]:
import pandas as pd
import nltk
import os
from collections import Counter
from cleantext import clean
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

### Create Corpus
By concatenating all articles, we build one large corpus each for football and basketball news.

In [2]:
# Load the two article collections; lines=True makes pandas read each file as
# JSON Lines (one JSON object per line).
bdf = pd.read_json("clean/basketball-stem-lem.jl", lines = True)
fdf = pd.read_json("clean/football-stem-lem.jl", lines = True)

def create_corpus(df, name):
    """Concatenate the ``content`` column of ``df`` into one corpus string.

    Every article is followed by a single space, so a non-empty result ends
    with a trailing space. A progress line is printed for every 1000th row.

    Args:
        df: DataFrame with a ``content`` column holding strings.
        name: Label shown in the progress messages.

    Returns:
        str: All ``content`` values in row order, each followed by one space;
        ``""`` when ``df`` has no rows.
    """
    total = len(df)
    if total == 0:
        return ""

    parts = []
    # Track progress by row position rather than index label: labels are not
    # guaranteed to be the integers 0..n-1 (filtered or string-indexed frames).
    for position, content in enumerate(df["content"]):
        if position % 1000 == 0:
            print("[Corpus Create] {} {:.2f}%".format(name, ((position + 1) / total) * 100))
        parts.append(content)
        parts.append(" ")
    # A single join avoids repeatedly growing one huge string with +=.
    return "".join(parts)

# Build one text blob per sport; the second argument is only the label used in
# the progress messages printed by create_corpus.
b_corpus = create_corpus(bdf,"basketball news")
f_corpus = create_corpus(fdf,"football news")


[Corpus Create] basketball news 0.01%
[Corpus Create] basketball news 10.01%
[Corpus Create] basketball news 20.02%
[Corpus Create] basketball news 30.02%
[Corpus Create] basketball news 40.03%
[Corpus Create] basketball news 50.03%
[Corpus Create] basketball news 60.03%
[Corpus Create] basketball news 70.04%
[Corpus Create] basketball news 80.04%
[Corpus Create] basketball news 90.05%
[Corpus Create] football news 0.00%
[Corpus Create] football news 1.91%
[Corpus Create] football news 3.83%
[Corpus Create] football news 5.74%
[Corpus Create] football news 7.65%
[Corpus Create] football news 9.56%
[Corpus Create] football news 11.47%
[Corpus Create] football news 13.38%
[Corpus Create] football news 15.30%
[Corpus Create] football news 17.21%
[Corpus Create] football news 19.12%
[Corpus Create] football news 21.03%
[Corpus Create] football news 22.94%
[Corpus Create] football news 24.86%
[Corpus Create] football news 26.77%
[Corpus Create] football news 28.68%
[Corpus Create] football 

### Sentence tokenization
nltk provides `sent_tokenize`, which I use to get the sentence count of each corpus.

In [3]:
# Split each corpus into sentences with NLTK's sentence tokenizer.
f_sentence_tokens = nltk.tokenize.sent_tokenize(f_corpus)
b_sentence_tokens = nltk.tokenize.sent_tokenize(b_corpus)

# Report how many sentences were found in each corpus.
print("football_sentences_count: ",len(f_sentence_tokens))
print("basketball_sentences_count: ",len(b_sentence_tokens))

football_sentences_count:  797716
basketball_sentences_count:  315544


### Data Cleaning
We don't want punctuation marks to be counted. Also, after reading the data from the `.jl` files, fixing Unicode errors helps.

In [4]:
# Options shared by both corpora, kept in one place so the two clean() calls
# cannot drift apart. Only unicode fixing, ASCII transliteration and
# punctuation removal are enabled; every other transformation is switched off.
CLEAN_OPTIONS = dict(
    fix_unicode=True,            # fix various unicode errors
    to_ascii=True,               # transliterate to closest ASCII representation
    lower=False,                 # do not lowercase the text
    no_line_breaks=False,        # only normalize line breaks, do not strip them
    no_urls=False,               # do not replace URLs with a special token
    no_emails=False,             # do not replace email addresses with a special token
    no_phone_numbers=False,      # do not replace phone numbers with a special token
    no_numbers=False,            # do not replace numbers with a special token
    no_digits=False,             # do not replace digits with a special token
    no_currency_symbols=False,   # do not replace currency symbols with a special token
    no_punct=True,               # remove punctuation
    replace_with_punct="",       # punctuation is replaced by the empty string
    lang="en",
)

f_corpus = clean(f_corpus, **CLEAN_OPTIONS)
b_corpus = clean(b_corpus, **CLEAN_OPTIONS)

In [5]:
# Persist the cleaned corpora as UTF-8 encoded files under ./corpus.
# Create the output directory first so this cell also works on a fresh
# checkout where the folder does not exist yet.
os.makedirs("corpus", exist_ok=True)

with open(os.path.join("corpus","basketball_corpus.txt"),"wb") as basketball_corpus:
    basketball_corpus.write(b_corpus.encode("utf-8"))

with open(os.path.join("corpus","football_corpus.txt"),"wb") as football_corpus:
    football_corpus.write(f_corpus.encode("utf-8"))

### Remove Number & URL tokens
After data cleaning, URLs and numbers are represented by the special tokens `<url>` and `<number>`. We remove these tokens because they should not be taken into account while counting words.

In [6]:
# Drop the "<number>" and "<url>" placeholder tokens from both corpora.
f_corpus = f_corpus.replace("<number>", "").replace("<url>", "")
b_corpus = b_corpus.replace("<number>", "").replace("<url>", "")


In [7]:
# Tokenize each corpus into word tokens with NLTK.
f_tokens = nltk.word_tokenize(f_corpus)
b_tokens = nltk.word_tokenize(b_corpus)

In [8]:
# Corpus sizes measured in tokens.
print("football_words_count: ", len(f_tokens))
print("basketball_words_count: ", len(b_tokens))

# Vocabulary (distinct tokens) of each corpus.
f_tokens_set, b_tokens_set = set(f_tokens), set(b_tokens)

print("football_unique_words_count: ", len(f_tokens_set))
print("basketball_unique_words_count: ", len(b_tokens_set))

# Shared vocabulary, then the tokens exclusive to each sport.
print("common_words_count: ", len(f_tokens_set & b_tokens_set))
print("football_different_words_count: ", len(f_tokens_set.difference(b_tokens_set)))
print("basketball_different_words_count: ", len(b_tokens_set.difference(f_tokens_set)))

# Number of articles behind each corpus.
print("football_articles_count: ", len(fdf))
print("basketball_articles_count: ", len(bdf))


football_words_count:  10005565
basketball_words_count:  3500059
football_unique_words_count:  58969
basketball_unique_words_count:  26278
common_words_count:  15745
football_different_words_count:  43224
basketball_different_words_count:  10533
football_articles_count:  52305
basketball_articles_count:  9996


In [9]:
# Tokens that appear in both vocabularies.
common_words_set = f_tokens_set & b_tokens_set

# Football-only tokens (with repetitions) and their 12 most frequent entries.
f_uncummon_words = [token for token in f_tokens if token not in common_words_set]
c = Counter(f_uncummon_words)
print(c.most_common(12))

# Basketball-only tokens (with repetitions) and their 12 most frequent entries.
b_uncummon_words = [token for token in b_tokens if token not in common_words_set]
d = Counter(b_uncummon_words)
print(d.most_common(12))


[('talksport', 22174), ('tottenham', 18933), ('mourinho', 12027), ('everton', 10548), ('klopp', 9603), ('ps00million', 8185), ('goalkeep', 8163), ('ps00', 7679), ('newcastl', 7518), ('solskjaer', 7113), ('trafford', 7059), ('guardiola', 6884)]
[('sixer', 2858), ('antetokounmpo', 2739), ('cav', 2637), ('postseason', 2347), ('pacer', 2249), ('doncic', 1943), ('apg', 1873), ('kawhi', 1718), ('lillard', 1711), ('timberwolv', 1500), ('nbacom', 1342), ('kyri', 1238)]


In [10]:
# Concatenation of both token lists (duplicates kept, football tokens first).
words_union = f_tokens + b_tokens

# Ten most frequent tokens among those that occur in both vocabularies.
top_ten_common_words = Counter(
    token for token in words_union if token in common_words_set
).most_common(10)

# Per-corpus token frequencies and total token counts.
football_words_count = Counter(f_tokens)
basketball_words_count = Counter(b_tokens)
football_words_total_count = len(f_tokens)
basketball_words_total_count = len(b_tokens)


In [18]:
# For each of the top shared words, compare its relative frequency in one
# corpus with its relative frequency in the other (the two ratios are
# reciprocals of each other).
football_normalized_frequencies = {}
basketball_normalized_frequencies = {}
for word in (pair[0] for pair in top_ten_common_words):
    football_share = football_words_count[word] / football_words_total_count
    basketball_share = basketball_words_count[word] / basketball_words_total_count
    football_normalized_frequencies[word] = football_share / basketball_share
    basketball_normalized_frequencies[word] = basketball_share / football_share

# Highest ratio first for each sport.
print(sorted(football_normalized_frequencies.items(), key=lambda item: item[1], reverse=True)[:10])
print(sorted(basketball_normalized_frequencies.items(), key=lambda item: item[1], reverse=True)[:10])


[('leagu', 2.205488103743583), ('I', 1.2751488835691884), ('s', 1.259293332036864), ('year', 1.1757236671289504), ('player', 0.9657418580177081), ('say', 0.8070262750766966), ('play', 0.794789046616545), ('be', 0.7837811764435959), ('season', 0.6728599571040706), ('game', 0.41019653524962174)]
[('game', 2.437855793665976), ('season', 1.486193359319391), ('be', 1.27586631327062), ('play', 1.2581954976066263), ('say', 1.2391170286307764), ('player', 1.0354733945700672), ('year', 0.8505399933318877), ('s', 0.7940961605685104), ('I', 0.7842221507507133), ('leagu', 0.4534143704074421)]


In [32]:
# Fit TF-IDF on a two-document collection: row 0 is the football corpus,
# row 1 is the basketball corpus.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([f_corpus, b_corpus])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
# One row per corpus, one column per vocabulary term.
df = pd.DataFrame(denselist, columns=feature_names)


print(df)

         00       000      0000   0000all    0000am    0000bc  0000blog  \
0  0.001619  0.000407  0.000618  0.000000  0.000033  0.000004  0.000011   
1  0.001640  0.000060  0.000975  0.000009  0.000000  0.000000  0.000000   

    0000bst  0000calcium   0000cet  ...      zych      zyci   zydruna  \
0  0.000014     0.000004  0.000004  ...  0.000004  0.000004  0.000000   
1  0.000000     0.000000  0.000000  ...  0.000000  0.000000  0.000085   

       zygi  zygimanta     zylan     zynex      zyro       zyz        zz  
0  0.000000   0.000000  0.000000  0.000004  0.000011  0.000004  0.000004  
1  0.000019   0.000009  0.000066  0.000000  0.000000  0.000000  0.000000  

[2 rows x 68723 columns]


In [56]:
# Order the term columns by the football row (label 0) and show that row's
# ten highest-weighted terms.
print("top 10 tf-idf football:")
print(df.sort_values(by=0, axis=1, ascending=False).iloc[:1, :10])
print("*"*80)
# Same for the basketball row (label 1).
print("top 10 tf-idf basketball:")
print(df.sort_values(by=1, axis=1, ascending=False).iloc[1:, :10])


top 10 tf-idf football:
      leagu      club    season        be    player      year       say  \
0  0.219325  0.202116  0.172411  0.170882  0.168717  0.161329  0.155355   

      game      play        he  
0  0.15407  0.147864  0.139387  
********************************************************************************
top 10 tf-idf basketball:
       game     point    season      team      nba        be       say  \
1  0.343101  0.235906  0.234065  0.219021  0.20225  0.199157  0.175846   

       play    player       get  
1  0.169944  0.159585  0.133551  


In [66]:
# (token, count) pairs over both corpora combined, most frequent first.
histogram = Counter(words_union).most_common()


In [73]:
# Show the first 40 entries of the histogram list.
print(histogram[:40])

[('s', 190032), ('game', 110895), ('season', 101799), ('leagu', 98717), ('be', 96012), ('player', 89284), ('say', 86512), ('play', 82724), ('I', 82416), ('year', 81320), ('club', 81082), ('team', 76518), ('get', 72828), ('he', 71397), ('go', 67812), ('make', 61870), ('one', 60968), ('last', 59526), ('point', 58272), ('first', 56440), ('back', 55793), ('time', 54207), ('we', 54099), ('goal', 51517), ('unit', 51357), ('good', 50333), ('win', 48256), ('two', 46909), ('come', 44937), ('see', 43837), ('premier', 43288), ('well', 42655), ('take', 42243), ('would', 41776), ('citi', 40365), ('like', 39787), ('think', 38861), ('also', 38616), ('old', 38265), ('it', 37899)]
