### ML-for-NLP2: Homework
<div class="alert alert-success">
    <b>Challenge 1 </b>:
     <ul>

What are the top ten words mentioned by Biden in the 105th Congress (after stopword removal)? For each of these most frequent words, find the 10 most similar words generated using word2vec. Find the most frequent bigrams in the text. Explore some bigrams and figure out whether their tokens appear in the list of most similar words.

   </ul>

</div>

In [1]:
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from numpy import array

In [2]:
# Read Biden's speech file; the context manager closes the handle as soon as
# the contents have been read (the bare open(...).read() left it open).
with open('Inputs/105-extracted-date/105-biden-de.txt', "r") as fp:
    file = fp.read()

# The document is XML-like, so BeautifulSoup is used to parse it.
# NOTE(review): no parser is named here, so BeautifulSoup picks one itself —
# consider pinning a parser explicitly so results do not depend on the install.
soup = BeautifulSoup(file)

# Keep only the text found inside <text> tags, concatenated in document order.
# ''.join avoids rebuilding the growing string on every iteration.
doc = ''.join(node.get_text() for node in soup.find_all('text'))

In [3]:
# Cut the speech text at every line break (optionally preceded by a space)
# and keep only the pieces that contain something other than whitespace.
raw_pars = re.split('\n| \n', doc)
pars = [piece for piece in raw_pars if piece.strip()]

In [4]:
# the stopword list provided by the NLTK library
stop_words1 = stopwords.words('english')

# the stopword list provided by the professor
# (read inside a context manager so the file handle is closed afterwards)
with open('Inputs/droplist.txt', "r") as drop_fh:
    drop_file = drop_fh.read()
drop_file = drop_file.replace('"', '')  # entries are quoted in the file
drop_list = drop_file.split("\n")
stop_words2 = drop_list[1:]  # first line is skipped — presumably a header; verify against the file

# union of both lists; a set gives fast membership tests during preprocessing
stop_words = set(stop_words1+stop_words2)

In [5]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``pos_tag`` and the first letter of
    the resulting tag selects adjective, noun, verb or adverb. Any other tag
    falls back to noun.
    """
    treebank_tag = pos_tag([word])[0][1]
    initial = treebank_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as nouns
    return wordnet.NOUN

In [6]:
def preprocessing_text(text):
    """Clean one piece of text and return it as a space-joined string of lemmas.

    Steps: replace runs of non-alphanumeric characters with a space, lowercase
    and tokenize, drop stop words, tokens containing digits and tokens shorter
    than two characters, lemmatize with a part-of-speech hint, and drop any
    lemma that is itself a stop word.
    """
    cleaned = re.sub(r'[^A-Za-z0-9 ]+', ' ', text)
    lemmatizer = WordNetLemmatizer()

    def is_candidate(token):
        # a token survives only if it passes all three pre-lemmatization filters
        if token in stop_words:
            return False
        if any(ch.isdigit() for ch in token):
            return False
        return len(token) >= 2

    lemmas = []
    for token in word_tokenize(cleaned.lower()):
        if not is_candidate(token):
            continue
        lemma = lemmatizer.lemmatize(token, get_wordnet_pos(token))
        # lemmatization can turn a word into a stop word, so filter again
        if lemma not in stop_words:
            lemmas.append(lemma)

    return ' '.join(lemmas)

The preprocessing takes some time, so once done I saved it as a pickle file and used the saved pickle file when I rerun the code.  

In [7]:
# Load the cached preprocessing result instead of recomputing it.
# NOTE(review): pickle.load can execute arbitrary code — only use it on files
# produced by this notebook.
with open("Inputs/preprocessed_biden105.pkl", "rb") as fp:   # Unpickling
    text_biden = pickle.load(fp)
    
# `text` is the name used by the following cells
text = text_biden

In [8]:
# Preview the first five preprocessed entries
text[:5]

['president pleased senate passing substitute amendment hopeful body pas measure congress adjourns',
 'legislation simple current federal law faith credit child custody custody define include visitation evidence country court automatically recognize visitation particularly visitation child parent grandparent court suppose honor arduous process',
 'legislation simply clarifies faith credit law include visitation absolutely court visitation enter consistently provision federal faith credit statute faith credit narrow legal sense current federal law law explicit hopefully eliminate hassle obstacle delay confront valid visitation federal law follow',
 'president author idea representative rob andrew jersey deserves credit bring issue attention day introduce bill visitation tireless push passage commend congratulate',
 'finally thank senator willingness move bill final day session lot press issue lose final crunch chairman staff gracious pas bill']

In [9]:
from gensim.models import Phrases

# Phrases expects token lists, so split every preprocessed paragraph on spaces
sentence_stream = [par.split(" ") for par in text]
# Learn which adjacent token pairs should be merged into a single bigram token.
# min_count and threshold control how frequent / strongly associated a pair
# must be — see the gensim Phrases documentation for the exact scoring.
bigram = Phrases(sentence_stream, min_count=20, threshold=2)

In [10]:
# Apply the fitted phrase model to each paragraph and re-join the resulting
# tokens (detected bigrams included) into one space-separated string.
text_bigram = [' '.join(bigram[par.split()]) for par in text]

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term count matrix over the bigram-merged paragraphs
# (one row per paragraph, one column per vocabulary term).
vectorizer = CountVectorizer()
count = vectorizer.fit_transform(text_bigram)

In [12]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Term-by-paragraph table: one row per vocabulary term, one column per paragraph
df = pd.DataFrame(count.toarray().transpose(), index=feature_names)
# Corpus-wide frequency of each term
df.loc[:, 'Total'] = df.sum(axis=1)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5791,5792,5793,5794,5795,5796,5797,5798,5799,Total
abandon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
abandonment,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
abate,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
abbey,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
abduction,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34
zubak,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
zumwalt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
zurich,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# Order terms from most to least frequent and show the ten most frequent
df_sorted = df.sort_values(by='Total', ascending=False)
df_sorted.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5791,5792,5793,5794,5795,5796,5797,5798,5799,Total
president,0,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1359
senator,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,817
nato,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,750
time,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,653
country,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,575
amendment,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,531
bill,0,0,0,1,2,1,2,1,0,0,...,0,0,0,0,0,0,0,0,0,521
senate,1,0,0,0,0,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,478
united,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,444
treaty,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,442


In [14]:
# The ten most frequent terms, taken from the index of the sorted table
top10 = list(df_sorted.index[:10])
top10

['president',
 'senator',
 'nato',
 'time',
 'country',
 'amendment',
 'bill',
 'senate',
 'united',
 'treaty']

In [15]:
def find_bigram(text):
    """Return only the bigram tokens of ``text`` as a space-joined string.

    A token counts as a bigram when it contains an underscore, which is how
    the merged pairs appear in the bigram-processed paragraphs.
    """
    return ' '.join(token for token in word_tokenize(text) if '_' in token)

In [16]:
# For every paragraph keep only its bigram tokens (empty string if it has none)
all_bigrams = [find_bigram(par) for par in text_bigram]

In [17]:
# Count bigram occurrences per paragraph.
# NOTE: this refits the same `vectorizer` object, so its vocabulary now holds
# the bigrams and no longer matches the earlier `count` matrix.
bigram_count = vectorizer.fit_transform(all_bigrams)

In [18]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    bigram_names = vectorizer.get_feature_names_out()
else:
    bigram_names = vectorizer.get_feature_names()

# Bigram-by-paragraph table: one row per bigram, one column per paragraph
df_bigram = pd.DataFrame(bigram_count.toarray().transpose(), index=bigram_names)
# Corpus-wide frequency of each bigram
df_bigram.loc[:, 'Total'] = df_bigram.sum(axis=1)

df_bigram

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5791,5792,5793,5794,5795,5796,5797,5798,5799,Total
address_chair,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,35
advice_consent,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,23
american_people,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,48
appropriation_bill,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,27
appropriation_committee,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
world_war,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,56
yield_floor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,92
yield_minute,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,31
yield_time,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,21


In [19]:
# Order bigrams from most to least frequent and show the ten most frequent
df_bigram_sorted = df_bigram.sort_values(by='Total', ascending=False)
df_bigram_sorted.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5791,5792,5793,5794,5795,5796,5797,5798,5799,Total
united_nation,0,0,0,0,0,0,1,2,0,0,...,0,0,0,0,0,0,0,0,0,164
foreign_policy,0,0,0,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,161
chemical_weapon,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,155
nato_enlargement,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,115
foreign_relation,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,110
nuclear_weapon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,96
yield_floor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,92
madam_president,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,75
arm_control,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,72
law_enforcement,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,67


In [20]:
# The ten most frequent bigrams, taken from the index of the sorted table
top10_bigram = list(df_bigram_sorted.index[:10])
top10_bigram

['united_nation',
 'foreign_policy',
 'chemical_weapon',
 'nato_enlargement',
 'foreign_relation',
 'nuclear_weapon',
 'yield_floor',
 'madam_president',
 'arm_control',
 'law_enforcement']

In [21]:
# Token lists of the bigram-merged paragraphs, used as Word2Vec training input.
# This rebinds `sentence_stream`, which earlier held the unmerged token lists.
sentence_stream = [par.split(" ") for par in text_bigram]

In [22]:
from gensim.models import Word2Vec

# Train word embeddings on the bigram-merged paragraphs: 100-dimensional
# vectors, context window of 5 tokens, min_count=1 so every token is kept,
# 4 worker threads.
# NOTE(review): no seed is set, so the neighbour lists may differ between
# runs; the neighbour scores shown in the outputs below are all around 0.999,
# which may indicate an under-trained model — consider more epochs.
model = Word2Vec(sentences=sentence_stream, vector_size=100, window=5, min_count=1, workers=4)

In [23]:
# Create a data frame of similar words: one column per top-10 word, whose rows
# are that word's ten nearest Word2Vec neighbours (closest first).
# most_similar is queried once per word; the previous version repeated the
# identical query for every row it filled in.
df_sim = pd.DataFrame({
    word: [neighbour for neighbour, _score in model.wv.most_similar(word, topn=10)]
    for word in top10
})

In [24]:
# Show the nearest-neighbour table for the ten most frequent words
df_sim

Unnamed: 0,president,senator,nato,time,country,amendment,bill,senate,united,treaty
0,issue,amendment,russia,friend,democracy,vote,legislation,vote,lead,ratify
1,offer,minute,germany,speak,nation,senator,offer,floor,nation,ratification
2,republican,friend,alliance,yield,russia,floor,president,colleague,ability,sign
3,wish,colleague,country,floor,threat,colleague,introduce,debate,arm,comprehensive
4,debate,vote,european,wish,political,minute,issue,bill,democracy,opponent
5,record,floor,security,colleague,united,friend,record,legislation,attack,pas
6,understand,time,threat,minute,economic,time,pass,president,country,convention
7,hope,senate,europe,debate,american,senate,provision,offer,maintain,agreement
8,suggest,speak,military,understand,alliance,speak,republican,speak,china,require
9,believe,distinguish,american,president,maintain,offer,support,chairman,political,condition


In [25]:
# Ten nearest neighbours of 'president' together with their similarity scores
model.wv.most_similar('president', topn=10)

[('issue', 0.9997382164001465),
 ('offer', 0.999714732170105),
 ('republican', 0.9996945261955261),
 ('wish', 0.9996935725212097),
 ('debate', 0.9996784925460815),
 ('record', 0.9996781945228577),
 ('understand', 0.9996644854545593),
 ('hope', 0.9996603727340698),
 ('suggest', 0.9996476173400879),
 ('believe', 0.9996441602706909)]

In [26]:
# Ten nearest neighbours of 'bill' together with their similarity scores
model.wv.most_similar('bill', topn=10)

[('legislation', 0.9997133612632751),
 ('offer', 0.9996192455291748),
 ('president', 0.9995484948158264),
 ('introduce', 0.9995418787002563),
 ('issue', 0.999530553817749),
 ('record', 0.9995121359825134),
 ('pass', 0.9995085597038269),
 ('provision', 0.9994838833808899),
 ('republican', 0.9994712471961975),
 ('support', 0.9994704127311707)]

In [27]:
# One column per top-10 bigram, whose rows are that bigram's twenty nearest
# Word2Vec neighbours (closest first).
# most_similar is queried once per bigram; the previous version repeated the
# identical query for every row it filled in.
df_sim2 = pd.DataFrame({
    phrase: [neighbour for neighbour, _score in model.wv.most_similar(phrase, topn=20)]
    for phrase in top10_bigram
})

In [28]:
# Show the nearest-neighbour table for the ten most frequent bigrams
df_sim2

Unnamed: 0,united_nation,foreign_policy,chemical_weapon,nato_enlargement,foreign_relation,nuclear_weapon,yield_floor,madam_president,arm_control,law_enforcement
0,require,term,convention,serve,arm_service,test,suggest,tell,weapon,public
1,include,step,require,decision,committee,war,majority,name,start,service
2,action,action,china,leader,chairman,china,moment,view,example,criminal
3,result,commitment,protocol,democratic,rank,weapon,talk,issue,action,police
4,body,continue,human,call,hearing,police,disagree,look,step,woman
5,party,result,step,begin,finance,ability,tell,try,china,system
6,establish,mean,ratify,regard,senate,system,congress,reason,word,fight
7,decision,require,list,opportunity,distinguish,democracy,look,add,decision,federal
8,foreign_policy,democratic,iraq,week,colleague,step,believe,approach,recent,child
9,step,plan,include,able,appropriation_committee,attack,name,hope,head,court


In [29]:
# Ten nearest neighbours of the bigram 'foreign_relation' with similarity scores
model.wv.most_similar('foreign_relation', topn=10)

[('arm_service', 0.9977909922599792),
 ('committee', 0.9972825050354004),
 ('chairman', 0.9968799352645874),
 ('rank', 0.9961227774620056),
 ('hearing', 0.994644284248352),
 ('finance', 0.9945824146270752),
 ('senate', 0.994530975818634),
 ('distinguish', 0.9944438338279724),
 ('colleague', 0.9943565726280212),
 ('appropriation_committee', 0.9940564036369324)]

<div class="alert alert-success">
    <b>Challenge 2</b>:
     <ul>

Use the senator speeches in the folder 105-extracted-date and use doc2vec to find which senator's speeches are closest to Senator Biden's. Use sen105kh_fix.csv and/or Wikipedia to validate your findings (i.e., check whether the most similar speeches come from senators of the same state and/or party). Describe your findings. Compare them with the outcome you got/will get using cosine similarity.

   </ul>

</div>

Notice: part of the code was taken from the previous group assignment where a similar task was given.

In [30]:
import os
import gensim

SPEECH_DIR = 'Inputs/105-extracted-date'

# os.listdir returns entries in arbitrary order, but later cells address
# senators by their position in `files`; sorting makes that position
# reproducible across machines and runs.
files = sorted(os.listdir(SPEECH_DIR))

list_docs = []

for file in files:
    # read inside a context manager so each handle is closed before the next file
    with open(os.path.join(SPEECH_DIR, file)) as speech_fh:
        xml = speech_fh.read()
    soup = BeautifulSoup(xml) #use BeautifulSoup library for parsing contents since documents have XML format
    # keep only the text within <text> tags, concatenated in document order
    doc = ''.join(node.get_text() for node in soup.find_all('text'))
    list_docs.append(doc)

> To save time, I use the saved pickle from the previous assignment that contains already preprocessed text (preprocessing is almost the same as above). I will further preprocess the text a bit (previously lemmatization was done without taking part of speech into account). 

In [31]:
# Load the preprocessed speeches cached by the previous assignment.
# NOTE(review): pickle.load can execute arbitrary code — only use it on
# trusted files.
with open("Inputs/preprocessed_docs105.pkl", "rb") as fp:   # Unpickling
    speech_list = pickle.load(fp)
    
# Preview the first 501 characters of the first speech
speech_list[0][:501]

'president debate final passage omnibus appropriation bill american competitiveness workforce improvement act included title subdivision unanimous consent document printed record included document received administration negotiation inclusion seeking help illuminate meaning provision legislation key document change july version september version copy submitted change marked redlining marking unfortunately submitted copy version copy fax marking appear effect september version unintelligible result'

In [32]:
def preprocessing_tokens(text):
    """Tokenize ``text``, lemmatize each token with a part-of-speech hint and
    return the list of lemmas that are not stop words."""
    lemmatizer = WordNetLemmatizer()
    lemmas = (
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in word_tokenize(text)
    )
    return [lemma for lemma in lemmas if lemma not in stop_words]

In [33]:
# Preprocessing the text takes more than an hour, so the result was pickled
# after the first run and is loaded from that file here.
# NOTE(review): presumably one token list per file, in the same order as
# `files` — later cells rely on that alignment; confirm when regenerating.
with open("Inputs/preprocessed_tokens105.pkl", "rb") as fp:   # Unpickling
    tokens = pickle.load(fp)

In [34]:
# Wrap every token list in a TaggedDocument whose single tag is the
# document's position, which is the input format Doc2Vec trains on.
docs_corpus = [
    gensim.models.doc2vec.TaggedDocument(words=doc_tokens, tags=[doc_idx])
    for doc_idx, doc_tokens in enumerate(tokens)
]

In [35]:
# Configure the Doc2Vec model (100-dimensional vectors, 40 training epochs).
# NOTE: this rebinds `model`, which held the Word2Vec model of Challenge 1;
# re-running a Challenge 1 cell after this one will query this model instead.
model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=40)

In [36]:
# Build the model vocabulary from the tagged documents (required before training)
model.build_vocab(docs_corpus)

In [37]:
# Train on the tagged documents, using the document count and epoch number
# already stored on the model
model.train(docs_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [38]:
# Sanity check of the trained model: re-infer a vector for every training
# document and record at which position the document finds itself among all
# document vectors, plus the entry ranked second in that list.
ranks = []
second_ranks = []

for doc_id, tagged_doc in enumerate(docs_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

    ranked_ids = [docid for docid, sim in sims]
    ranks.append(ranked_ids.index(doc_id))

    second_ranks.append(sims[1])

In [39]:
# Locate Biden's document by file name instead of a hard-coded position, so
# the lookup stays correct if the directory listing changes.
biden_idx = files.index('105-biden-de.txt')

# Infer a vector for Biden's speeches and rank every document vector by its
# similarity to it (most similar first).
inferred_vector = model.infer_vector(docs_corpus[biden_idx].words)
similarity_list = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Split the (document id, similarity) pairs into two parallel lists
similarity_rank = [doc[0] for doc in similarity_list]
similarity_values = [doc[1] for doc in similarity_list]


In [40]:
# Ten most similar documents as (document id, similarity) pairs
similarity_list[:10]

[(6, 0.9972328543663025),
 (42, 0.4267270565032959),
 (58, 0.4207288324832916),
 (74, 0.4108615815639496),
 (15, 0.3572089374065399),
 (68, 0.35167014598846436),
 (91, 0.34036245942115784),
 (48, 0.3400339186191559),
 (34, 0.3396741449832916),
 (71, 0.3259413242340088)]

#### Validation of findings (i.e., understand if the most similar speeches are senators from the same state and/party).

In [41]:
# Read the provided CSV with information about the senators
# (the file is semicolon-separated)
senators_df = pd.read_csv('Inputs/sen105kh_fix.csv',sep = ';')

Now we want to add a column with similarities to the *senators_df*

In [42]:
# Create a key column in the format "lname-stateab"; it matches the names
# derived from the speech file names and is used as the merge key later.
senators_df['lname_state'] = senators_df[['lname', 'stateab']].agg('-'.join, axis=1)

In [43]:
# Translate the ranked document ids into "lname-state" names by dropping the
# first four and last four characters of the corresponding file name.
names_by_rank = [files[doc_idx][4:-4] for doc_idx in similarity_rank]

names_by_rank[:10]

['biden-de',
 'grams-mn',
 'kerrey-ne',
 'moynihan-ny',
 'byrd-wv',
 'lugar-in',
 'specter-pa',
 'helms-nc',
 'feingold-wi',
 'mcconnell-ky']

In [44]:
# Create a data frame pairing each senator key with its gensim similarity
# to Biden (both lists are in ranking order)
similarity_df = pd.DataFrame(
    {'lname_state': names_by_rank, # "lname-state" keys
     'gensim_similarity': similarity_values
    })

# Left-join onto the senator table so every senator row is kept; senators
# without a matching speech file get a missing similarity.
gensim_sim_df = pd.merge(senators_df,similarity_df,on='lname_state',how='left')

In [45]:
# Show the senator table with the gensim similarity column attached
gensim_sim_df 

Unnamed: 0,cong,lname,stateab,lstate,id,dist,party,lname_state,gensim_similarity
0,105,sessions,al,ALABAMA,49700,0,200,sessions-al,0.091432
1,105,shelby,al,ALABAMA,94659,0,200,shelby-al,0.239985
2,105,murkowski,ak,ALASKA,14907,0,200,murkowski-ak,0.151816
3,105,stevens,ak,ALASKA,12109,0,200,stevens-ak,0.142532
4,105,kyl,az,ARIZONA,15429,0,200,kyl-az,0.285171
...,...,...,...,...,...,...,...,...,...
95,105,rockefeller,wv,WEST VI,14922,0,100,rockefeller-wv,0.084187
96,105,feingold,wi,WISCONS,49309,0,100,feingold-wi,0.339674
97,105,kohl,wi,WISCONS,15703,0,100,kohl-wi,0.239453
98,105,enzi,wy,WYOMING,49706,0,200,enzi-wy,0.042320


The next step is to clean the dataframe. There is only one change we should do: replace numbers in party with the correct name. As Biden is Democrat and his party is 100, we replace 100 with 'democratic'. 

In [46]:
# Recode the numeric party codes as names, then add a 0/1 indicator column
# marking Democrats.
party_names = {100: 'democratic', 200: 'republican'}
gensim_sim_df['party'] = gensim_sim_df['party'].replace(party_names)

democrat_mask = gensim_sim_df['party'] == 'democratic'
gensim_sim_df['is_democrat'] = np.where(democrat_mask, 1, 0)

Sort df by similarity value:

In [47]:
# Sort from most to least similar and drop the columns not needed for the
# comparison. errors='ignore' keeps the cell re-runnable: on a second run the
# columns are already gone, which previously raised a KeyError.
gensim_sim_df = (
    gensim_sim_df
    .sort_values(by=['gensim_similarity'], ascending=False)
    .reset_index(drop=True)
    .drop(columns=['cong', 'stateab', 'id', 'dist'], errors='ignore')
)

Now let's look at the information about the top 10 speeches most similar to Biden's:

In [48]:
# Show 11 rows: Biden himself (row 0) plus the ten most similar senators
gensim_sim_df.head(11)

Unnamed: 0,lname,lstate,party,lname_state,gensim_similarity,is_democrat
0,biden,DELAWAR,democratic,biden-de,0.997233,1
1,grams,MINNESO,republican,grams-mn,0.426727,0
2,kerrey,NEBRASK,democratic,kerrey-ne,0.420729,1
3,moynihan,NEW YOR,democratic,moynihan-ny,0.410862,1
4,byrd,WEST VI,democratic,byrd-wv,0.357209,1
5,lugar,INDIANA,republican,lugar-in,0.35167,0
6,specter,PENNSYL,republican,specter-pa,0.340362,0
7,helms,NORTH C,republican,helms-nc,0.340034,0
8,feingold,WISCONS,democratic,feingold-wi,0.339674,1
9,mcconnell,KENTUCK,republican,mcconnell-ky,0.325941,0


- As the table shows, three of the five speeches most similar to Biden's (Kerrey, Moynihan and Byrd) were given by Democrats. This makes sense, since we expect politicians from the same party to give similar speeches.
- Byrd represented West Virginia, which is geographically close to Biden's Delaware.
- Some of the other states in the top 10 are also relatively close to Delaware (New York and Pennsylvania are both in the north-eastern part of the country), with Pennsylvania sharing a border with it.
- The most similar speech being given by a Republican (Grams) can also be explained by the fact that Biden was known politically as a centrist who would work “across-the-aisle” with his Republican counterparts, so some alignment with centrist members of both parties could take place. However, Robert Byrd, one of the closest Democrats, is generally considered a solid Democrat rather than a centrist.



Now let's compare these results with the outcome that we get using cosine similarity.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Locate Biden's document by file name rather than a hard-coded position
biden_idx = files.index('105-biden-de.txt')
vector_biden = model.infer_vector(docs_corpus[biden_idx].words)

cos_similarity = []
names_orig_order = []

# Iterate over every document in the corpus (previously a hard-coded
# range(100)), so the loop adapts to the actual number of speeches.
for doc_idx, tagged_doc in enumerate(docs_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)

    # "lname-state" key derived from the file name, in original file order
    name = files[doc_idx][4:-4]
    names_orig_order.append(name)

    # cosine_similarity expects 2-D inputs, hence the reshape to one row each;
    # the result is a 1x1 matrix whose only entry is the similarity
    temp_sim = cosine_similarity(vector_biden.reshape(1, -1), inferred_vector.reshape(1, -1))
    cos_similarity.append(temp_sim[0][0])

In [None]:
# Side-by-side histograms of the two similarity measures on the same bins and
# range, so their distributions can be compared directly.
# `plt` is matplotlib.pyplot, imported in the notebook's import cell.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = [
    (similarity_values, 'gensim similarity'),
    (cos_similarity, 'cosine similarity'),
]
for ax, (values, title) in zip(axes, panels):
    ax.hist(values, bins=20, range=[0.0, 0.5], edgecolor='black', color="skyblue")
    ax.set_title(title)
    # label the axes so each panel can be read on its own
    ax.set_xlabel("similarity to Biden's speeches")
    ax.set_ylabel('number of senators')

plt.show()

In [None]:
# Create a data frame pairing each senator key with its cosine similarity to
# Biden. NOTE: this rebinds `similarity_df`, which earlier held the gensim
# similarities.
similarity_df = pd.DataFrame(
    {'lname_state': names_orig_order, # "lname-state" keys in file order
     'cosine_similarity': cos_similarity
    })

# Join the gensim and cosine similarity data frames for comparison
cosine_sim_df = pd.merge(gensim_sim_df,similarity_df,on='lname_state',how='left')

In [None]:
# Sort from most to least similar according to cosine similarity
cosine_sim_df = cosine_sim_df.sort_values(by=['cosine_similarity'], ascending=False).reset_index(drop=True)

# Zero-based rank under each measure (0 = most similar), for direct comparison.
# NOTE(review): astype(int) fails if a similarity is missing (possible after
# the left joins) — confirm every senator has a matching speech file.
cosine_sim_df['gensim_rank'] = cosine_sim_df['gensim_similarity'].rank(ascending=False).astype(int)-1
cosine_sim_df['cosine_rank'] = cosine_sim_df['cosine_similarity'].rank(ascending=False).astype(int)-1

# Show 11 rows: the top entry plus the next ten
cosine_sim_df.head(11)

- The results for cosine similarity analysis are very similar to what we had when we used gensim similarity. 
- Even though the similarity values are slightly different, the ranking of the most similar speeches to Biden's speech is the same for the top-3. For other ranks in top-10, even if the ranking changed the change was not significant (maximum by one position). 