<a href="https://colab.research.google.com/github/ekolonsky/RIA_news/blob/main/RIA_Novosti_EDA_n_grams_freq_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Frequency analysis with n-grams

In [2]:
# install packages for NLP
!pip install pymorphy2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import requests, re
from collections import Counter

In [4]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer(lang='ru')
# пример - морфологический разбора слова "идём" 
print(morph.parse('идём'))
normal = {}

[Parse(word='идём', tag=OpencorporaTag('VERB,impf,intr plur,1per,pres,indc'), normal_form='идти', score=0.5, methods_stack=((DictionaryAnalyzer(), 'идём', 1696, 2),)), Parse(word='идём', tag=OpencorporaTag('VERB,impf,intr sing,impr,incl'), normal_form='идти', score=0.5, methods_stack=((DictionaryAnalyzer(), 'идём', 1696, 11),))]


In [5]:
import cloudpickle as cp
from urllib.request import urlopen

url = 'https://github.com/ekolonsky/RIA_news/raw/main/'

normal = cp.load(urlopen(url + "normal.pkl")) 

print(len(normal), normal['москве'])

415973 москва


In [6]:
  # Функция для разбиений на токены

separators = ';', ',', '"', ':', '.','!', '?', ' ', '`', '%','$','*','(',')','—'
regex_punctuation = '|'.join(map(re.escape, separators))
url = 'https://github.com/ekolonsky/RIA_news/raw/main/'


def get_stopwords():
    req = requests.get(url + 'stopwords.txt')
    ans = req.text.split()
    return ans
stopwords = get_stopwords() # затем сделаем загрузку из словаря стоп-слов


def normalize(word):
    word = word.lower()
    if word not in normal:
      normal[word] = morph.parse(word)[0].normal_form
    return normal[word]

def tokenize(text):
    text = text.lower()
    tokens = [normalize(word) for word in re.split(regex_punctuation,text)  
      if not word.isnumeric()
      and word != ''] 
    return [w for w in tokens if w not in stopwords] 

def generate_N_grams(tokens,ngram=1):
    temp=zip(*[tokens[i:] for i in range(0,ngram)])
    ans=['_'.join(t) for t in temp]
    return ans

# пример
generate_N_grams(tokenize('В Москве назвали победителей Гран-при 15 Московского кинофестиваля "Минотавр".'),3)

['москва_назвать_победитель',
 'назвать_победитель_гран-при',
 'победитель_гран-при_московский',
 'гран-при_московский_кинофестиваль',
 'московский_кинофестиваль_минотавр']

In [7]:
# функция для извлечения даты новости из строки со ссылкой на новость
def get_date(link):
  pattern = r'\/20[0-2][0-9][0-1][0-9][0-3][0-9]\/'
  result = re.search(pattern, link)

  if result:
    return result[0][1:-1]
  else:
    return ''
  return 

# пример
get_date('https://ria.ru/20041229/774359.html	Жерар Депардье открыл в Париже второй ресторан')

'20041229'

## Read and count n-grams

In [64]:
%%time
#url = 'https://raw.githubusercontent.com/ekolonsky/RIA_news/main/'
url = 'https://github.com/ekolonsky/RIA_news/raw/main/'
filenames = ['ria-{:02d}-1.txt'.format(i) for i in range(2,23)] 
filenames += ['ria-{:02d}-2.txt'.format(i) for i in range(2,22)]
#filenames = ['ria-{:02d}-1.txt'.format(i) for i in range(13,15)]  # quick test

ngrams = [Counter(), Counter()]
corpus_cnt = Counter()
properties = {}

YEAR = '2014'

for filename in filenames:
  print(filename)
  req = requests.get(url + filename)
  for line in req.text.splitlines():
    sep = line.find('\t')
    if sep == -1:
      continue
    link, news = line[:sep],line[sep+1:]
    tokens = tokenize(news)
    
    #in_whitelist =  any(word in whitelist for word in tokens)  # filter news by words in whitelist

    date = get_date(link)
    year = date[:4]
    flag = 0 if year < YEAR else 1
    for n in [2, 3, 4]:
      for ngram in generate_N_grams(tokens,n):
          ngrams[flag][ngram] +=1
          corpus_cnt[(n, flag)] += 1
          if ngram not in properties:
            properties[ngram] = n



ria-02-1.txt
ria-03-1.txt
ria-04-1.txt
ria-05-1.txt
ria-06-1.txt
ria-07-1.txt
ria-08-1.txt
ria-09-1.txt
ria-10-1.txt
ria-11-1.txt
ria-12-1.txt
ria-13-1.txt
ria-14-1.txt
ria-15-1.txt
ria-16-1.txt
ria-17-1.txt
ria-18-1.txt
ria-19-1.txt
ria-20-1.txt
ria-21-1.txt
ria-22-1.txt
ria-02-2.txt
ria-03-2.txt
ria-04-2.txt
ria-05-2.txt
ria-06-2.txt
ria-07-2.txt
ria-08-2.txt
ria-09-2.txt
ria-10-2.txt
ria-11-2.txt
ria-12-2.txt
ria-13-2.txt
ria-14-2.txt
ria-15-2.txt
ria-16-2.txt
ria-17-2.txt
ria-18-2.txt
ria-19-2.txt
ria-20-2.txt
ria-21-2.txt
CPU times: user 4min 36s, sys: 10.3 s, total: 4min 47s
Wall time: 6min 47s


## Sort by rank R
Rank $ R = (F_2 - F_1) * \log N,$ where $F_1$ - frequency before 2014 year, $F_2$ - frequency after 2014. 

In [66]:
# print out corpus sum per year and per n

corpus_cnt

Counter({(2, 0): 9248778,
         (2, 1): 8381357,
         (3, 0): 7697234,
         (3, 1): 6924419,
         (4, 0): 6151429,
         (4, 1): 5471703})

In [67]:
from math import log

with open('nrgams-freq.txt', mode='w', encoding='utf-8') as file:
  for ngram in set(ngrams[0]) & set(ngrams[1]):
    n = properties[ngram]
    N1 = corpus_cnt[(n, True)]
    N2 = corpus_cnt[(n, False)]
    C1 = ngrams[0][ngram]
    C2 = ngrams[1][ngram]
    N = N1 + N2
    F1 = C1 / N1
    F2 = C2 / N2
    R = (F2 - F1) * log(N)

    if C1 + C2 > 100: # skip ngrams less then treshold
      line = '{0};{1};{2};{3};{4};{5};{6}\n'.format(n, ngram, C1, C2, F1, F2, R)
      file.write(line)


In [68]:
import pandas as pd

df = pd.read_csv('nrgams-freq.txt', 
                 delimiter=';',
                 names=['n','ngram','C1','C2','F1', 'F2','R'])
df['C'] = df.C1 + df.C2

In [69]:
# top 10 2-grams
df[df.n==2].sort_values('C',ascending=False ).head(10)

Unnamed: 0,n,ngram,C1,C2,F1,F2,R,C
2281,2,человек_погибнуть,10669,6188,0.001273,0.000669,-0.010076,16857
12936,2,глава_мид,7656,8238,0.000913,0.000891,-0.000379,15894
12951,2,погибнуть_человек,5937,4827,0.000708,0.000522,-0.003111,10764
8762,2,принять_участие,6110,3715,0.000729,0.000402,-0.005461,9825
14007,2,мид_рф,6064,3549,0.000724,0.000384,-0.005669,9613
14028,2,сборная_россия,5934,1833,0.000708,0.000198,-0.008506,7767
1660,2,число_жертва,3114,3750,0.000372,0.000405,0.000566,6864
11115,2,цена_нефть,3504,3101,0.000418,0.000335,-0.001381,6605
10217,2,федеральный_округ,6526,57,0.000779,6e-06,-0.012889,6583
15396,2,теннисный_турнир,5770,755,0.000688,8.2e-05,-0.010125,6525


In [70]:
# top 10 2-grams decreasing weighted frequency
df[df.n==2].sort_values('R',ascending=True ).head(10)

Unnamed: 0,n,ngram,C1,C2,F1,F2,R,C
10217,2,федеральный_округ,6526,57,0.000779,6.162976e-06,-0.012889,6583
15396,2,теннисный_турнир,5770,755,0.000688,8.163241e-05,-0.010125,6525
2281,2,человек_погибнуть,10669,6188,0.001273,0.0006690614,-0.010076,16857
6510,2,владимир_путин,5304,357,0.000633,3.859969e-05,-0.009915,5661
14028,2,сборная_россия,5934,1833,0.000708,0.0001981883,-0.008506,7767
5740,2,планировать_освещать,4741,1051,0.000566,0.0001136366,-0.007542,5792
12711,2,-_эксперт,3686,8,0.00044,8.649791e-07,-0.007323,3694
15507,2,событие_который,4562,1054,0.000544,0.000113961,-0.00718,5616
14007,2,мид_рф,6064,3549,0.000724,0.0003837264,-0.005669,9613
8762,2,принять_участие,6110,3715,0.000729,0.0004016747,-0.005461,9825


In [76]:
# top 10 2-grams increasing weighted frequency
df[df.n==2].sort_values('R',ascending=False ).head(10)

Unnamed: 0,n,ngram,C1,C2,F1,F2,R,C
7674,2,эксперт_рассказать,125,3306,1.491405e-05,0.000357,0.005715,3431
13476,2,новый_случай,258,3158,3.07826e-05,0.000341,0.005184,3416
682,2,северный_поток,144,2982,1.718099e-05,0.000322,0.005093,3126
5327,2,санкция_россия,16,2746,1.908999e-06,0.000297,0.004922,2762
13343,2,сообщить_сми,14,2654,1.670374e-06,0.000287,0.00476,2668
16180,2,заражение_коронавирус,3,2640,3.579373e-07,0.000285,0.004757,2643
17308,2,случай_заражение,146,2771,1.741961e-05,0.0003,0.004708,2917
6764,2,украинский_силовик,6,2593,7.158745e-07,0.00028,0.004666,2599
12454,2,рассказать_какой,52,2351,6.204246e-06,0.000254,0.004138,2403
5130,2,объяснить_почему,197,2430,2.350455e-05,0.000263,0.003992,2627


In [77]:
# top 10 3-grams
df[df.n==3].sort_values('C',ascending=False ).head(10)

Unnamed: 0,n,ngram,C1,C2,F1,F2,R,C
10311,3,землетрясение_магнитуда_произойти,2085,701,0.0003011083,9.1e-05,-0.003465,2786
7936,3,рынок_акция_рф,1954,817,0.0002821897,0.000106,-0.002904,2771
14729,3,возбудить_уголовный_дело,1796,926,0.0002593719,0.00012,-0.002294,2722
2330,3,официальный_курс_евро,611,1691,8.823845e-05,0.00022,0.002169,2302
14244,3,случай_заражение_коронавирус,1,2300,1.444164e-07,0.000299,0.004927,2301
6445,3,новость_планировать_освещать,1183,959,0.0001708447,0.000125,-0.000763,2142
14318,3,выявить_новый_случай,19,1980,2.743913e-06,0.000257,0.004199,1999
7852,3,событие_который_новость,998,960,0.0001441276,0.000125,-0.00032,1958
9246,3,который_новость_планировать,996,960,0.0001438388,0.000125,-0.000315,1956
126,3,событие_который_редакция,1851,90,0.0002673148,1.2e-05,-0.004217,1941


In [79]:
# top 10 3-grams decreasing weighted frequency
df[df.n==3].sort_values('R',ascending=True ).head(10)

Unnamed: 0,n,ngram,C1,C2,F1,F2,R,C
126,3,событие_который_редакция,1851,90,0.000267,1.169251e-05,-0.004217,1941
6645,3,который_планировать_освещать,1705,2,0.000246,2.598336e-07,-0.004058,1707
17715,3,событие_который_планировать,1694,2,0.000245,2.598336e-07,-0.004032,1696
7754,3,министр_иностранный_дело,1712,129,0.000247,1.675927e-05,-0.003802,1841
10311,3,землетрясение_магнитуда_произойти,2085,701,0.000301,9.107168e-05,-0.003465,2786
5341,3,сибирский_федеральный_округ,1312,3,0.000189,3.897504e-07,-0.00312,1315
3490,3,дальневосточный_федеральный_округ,1307,5,0.000189,6.49584e-07,-0.003103,1312
17323,3,глава_мид_рф,1500,231,0.000217,3.001078e-05,-0.003079,1731
1923,3,уральский_федеральный_округ,1246,6,0.00018,7.795008e-07,-0.002956,1252
7936,3,рынок_акция_рф,1954,817,0.000282,0.000106142,-0.002904,2771


In [80]:
# top 10 3-grams increasing weighted frequency
df[df.n==3].sort_values('R',ascending=False ).head(10)

Unnamed: 0,n,ngram,C1,C2,F1,F2,R,C
14244,3,случай_заражение_коронавирус,1,2300,1.444164e-07,0.000299,0.004927,2301
14318,3,выявить_новый_случай,19,1980,2.743913e-06,0.000257,0.004199,1999
17221,3,новый_случай_заражение,30,1498,4.332493e-06,0.000195,0.003139,1528
2363,3,число_жертва_коронавирус,1,1095,1.444164e-07,0.000142,0.002345,1096
2330,3,официальный_курс_евро,611,1691,8.823845e-05,0.00022,0.002169,2302
4389,3,произойти_землетрясение_магнитуда,360,1266,5.198992e-05,0.000164,0.001856,1626
5648,3,новый_случай_коронавирус,1,747,1.444164e-07,9.7e-05,0.001599,748
5395,3,ск_возбудить_дело,127,792,1.834089e-05,0.000103,0.001395,919
15426,3,назвать_возможный_причина,34,654,4.910159e-06,8.5e-05,0.001321,688
1369,3,получить_год_колония,208,840,3.003862e-05,0.000109,0.001305,1048


In [78]:
# top 10 4-grams
df[df.n==4].sort_values('C',ascending=False ).head(20)

Unnamed: 0,n,ngram,C1,C2,F1,F2,R,C
1880,4,событие_который_новость_планировать,996,960,0.0001820274,0.0001560613,-0.000422,1956
8593,4,который_новость_планировать_освещать,996,959,0.0001820274,0.0001558987,-0.000425,1955
4535,4,событие_который_планировать_освещать,1692,2,0.0003092273,3.251277e-07,-0.005025,1694
6097,4,выявить_новый_случай_заражение,1,1111,1.827585e-07,0.0001806084,0.002935,1112
7307,4,событие_который_редакция_москва,1014,39,0.0001853171,6.33999e-06,-0.002912,1053
11731,4,который_редакция_москва_планировать,1009,39,0.0001844033,6.33999e-06,-0.002897,1048
13640,4,редакция_москва_планировать_освещать,1009,39,0.0001844033,6.33999e-06,-0.002897,1048
8551,4,событие_который_редакция_культура,831,51,0.0001518723,8.290757e-06,-0.002336,882
11268,4,редакция_культура_планировать_освещать,829,51,0.0001515068,8.290757e-06,-0.00233,880
1308,4,который_редакция_культура_планировать,829,51,0.0001515068,8.290757e-06,-0.00233,880
