In [1]:
import os
import sqlite3
import pandas as pd
import re
import numpy as np
from collections import Counter

### Объявляем вспомогательные функции для соединения с базой

In [2]:
def create_connection(path):
    """Open a fresh SQLite database at ``path``.

    Any file that already exists at ``path`` is deleted first, so the returned
    connection always points at an empty database.

    Args:
        path: Filesystem location of the SQLite database file.

    Returns:
        The open ``sqlite3.Connection``, or ``None`` when SQLite reported an
        error (the error is printed, not raised).
    """
    try:
        # Start from scratch: drop the database left over from a previous run.
        if os.path.exists(path):
            os.remove(path)
        db_handle = sqlite3.connect(path)
    except sqlite3.Error as err:
        print(f"The error '{err}' occurred")
        return None

    print("Connection to SQLite DB successful")
    return db_handle

In [3]:
def execute_query(connection, query):
    """Execute a single SQL statement and commit it.

    Args:
        connection: An open ``sqlite3.Connection``.
        query: SQL text to execute.

    Returns:
        None. The outcome is reported on stdout: a success message, or the
        SQLite error text when the statement fails (the error is not raised).
    """
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        connection.commit()
        print("Query executed successfully")
    # The bare name `Error` is not defined in this notebook; the exception
    # class lives in the sqlite3 module. Without the qualification a failing
    # query surfaced as a NameError instead of being reported.
    except sqlite3.Error as e:
        print(f"The error '{e}' occurred")
    finally:
        # Release the cursor whether or not the statement succeeded.
        cursor.close()

## Шаг 1

### Загружаем CSV-файлы в датафреймы и очищаем их

In [4]:
# Matches every character that is neither an ASCII letter nor whitespace.
_NON_LATIN_PATTERN = re.compile(r"[^a-zA-Z\s]")


def remove_non_latin_symbols(text):
    """Return ``text`` with everything except ASCII letters and whitespace removed."""
    return _NON_LATIN_PATTERN.sub("", text)

In [5]:
# Load the stop-word list (one word per row, no header) and normalise it the
# same way as the article text: lower-case, letters and whitespace only.
stopwords_df = pd.read_csv("stopwords.csv", names=["word"], header=None)
lowercased_words = stopwords_df["word"].str.lower()
stopwords_df["word"] = lowercased_words.apply(remove_non_latin_symbols)

In [6]:
# Set of cleaned stop words, used for membership tests in remove_stopwords.
stopwords = {word for word in stopwords_df["word"].values}

def remove_stopwords(text):
    """Drop every stop word from ``text``.

    The text is split on whitespace; a token is discarded when its lower-cased
    form is in the module-level ``stopwords`` set. The surviving tokens are
    joined back with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stopwords:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [7]:
# Load the articles (semicolon-separated, no header row) and clean the text:
# lower-case it, strip everything but letters/whitespace, then drop stop words.
articles_df = pd.read_csv("articles.csv", sep=";", names=["id", "text"], header=None)
letters_only_text = articles_df["text"].str.lower().apply(remove_non_latin_symbols)
articles_df["text"] = letters_only_text.apply(remove_stopwords)

### Устанавливаем соединение с базой, создаем таблицы и загружаем в них очищенные датафреймы

In [8]:
# create_connection deletes an existing file at this path before connecting,
# so every run of the notebook starts from an empty articles.db.
connection = create_connection("articles.db")

Connection to SQLite DB successful


In [9]:
# DDL for the table that receives the cleaned articles (id + text).
create_articles_table = """
    CREATE TABLE IF NOT EXISTS articles (
        id INTEGER,
        text TEXT
    );
"""

In [10]:
# Create the articles table; execute_query commits and prints the outcome.
execute_query(connection, create_articles_table)  

Query executed successfully


In [11]:
# Append the cleaned articles to the table, then read the table back to check
# what was stored.
articles_df.to_sql(name="articles", con=connection, if_exists="append", index=False)
pd.read_sql("SELECT * FROM articles", connection)

Unnamed: 0,id,text
0,1,bradley charles cooper born january american a...
1,2,cooper enrolled mfa program actors studio begi...
2,3,cooper found greater success romantic comedy s...
3,4,labeled sex symbol media cooper named people m...
4,5,cooper born january abington township near phi...
...,...,...
355,356,roosevelt won reelection lower percentage elec...
356,357,last election new hampshire oregon voted democ...
357,358,elections eventual winner carry ohio elections
358,359,electoral votes received roosevelt added elect...


In [12]:
# DDL for the table that receives the cleaned stop words (a single text column).
create_stopwords_table = """
    CREATE TABLE IF NOT EXISTS stopwords (
        word TEXT
    );
"""

In [13]:
# Create the stopwords table; execute_query commits and prints the outcome.
execute_query(connection, create_stopwords_table)  

Query executed successfully


In [14]:
# Append the cleaned stop words to the table, then read the table back to
# check what was stored.
stopwords_df.to_sql(name="stopwords", con=connection, if_exists="append", index=False)
pd.read_sql("SELECT * FROM stopwords", connection)

Unnamed: 0,word
0,x
1,y
2,your
3,yours
4,yourself
...,...
728,excepts
729,except
730,excepting
731,exes


## Шаг 2

Для каждой комбинации слов посчитать NPMI и вывести на экран TOP-50 самых популярных коллокаций, отсортированных по убыванию значения NPMI.\
Комбинацию слов объединить пробелом.

### Вариант №1: используя Pandas

In [15]:
# Tokenize every article once; the word counts and the bigram list are both
# derived from these token lists.
tokenized_texts = [text.split() for text in articles_df["text"].tolist()]

# Bigrams are pairs of adjacent words inside one article; a pair never spans
# two articles.
bigrams = []
for words in tokenized_texts:
    bigrams.extend(zip(words, words[1:]))

# Frequency tables for single words and for bigrams.
word_counts = Counter(word for words in tokenized_texts for word in words)
bigram_counts = Counter(bigrams)

# The totals do not change while scoring, so compute them once instead of
# re-summing both counters for every bigram.
total_words = sum(word_counts.values())
total_bigrams = sum(bigram_counts.values())

# NPMI(w1, w2) = log(p(w1, w2) / (p(w1) * p(w2))) / -log(p(w1, w2)).
# Bigram probabilities are normalised by the number of bigrams and word
# probabilities by the number of words; because these denominators differ,
# a score can end up slightly above 1.
npmi_scores = {}
for bigram, bigram_count in bigram_counts.items():
    w1, w2 = bigram
    p_w1w2 = bigram_count / total_bigrams
    p_w1 = word_counts[w1] / total_words
    p_w2 = word_counts[w2] / total_words
    npmi_scores[bigram] = np.log(p_w1w2 / (p_w1 * p_w2)) / -np.log(p_w1w2)

# Store the scores in a DataFrame, with each bigram rendered as "w1 w2".
npmi_df = pd.DataFrame(list(npmi_scores.items()), columns=["bigram", "npmi_score"])
npmi_df["bigram"] = [" ".join(bigram) for bigram in npmi_df["bigram"]]

# Show the 50 bigrams with the highest NPMI.
npmi_df = npmi_df.sort_values(by=["npmi_score"], ascending=False)
npmi_df.head(50)


Unnamed: 0,bigram,npmi_score
4211,fish fry,1.005224
191,nightmare alley,1.005107
118,linings playbook,1.005107
533,los angeles,1.005107
1448,guardians galaxy,1.004971
5528,willy wanker,1.004805
11710,iberian peninsula,1.004805
2015,licorice pizza,1.004805
2328,barack obama,1.004805
3804,ella fitzgerald,1.00459


### Вариант №2: используя SQL

In [16]:
# Variant 2: the same NPMI formula as the pandas cell, evaluated inside SQLite.
#   * split_words  - recursive CTE that walks each article's text from one
#                    space to the next, emitting one row per fragment together
#                    with its position `pos`.
#   * bigrams      - glues every non-blank fragment to the next one of the
#                    same article (lead() ordered by pos); the last word of an
#                    article has no successor, so its bigram is NULL and is
#                    filtered out further down.
#   * inner SELECT - per bigram: its count, the number of bigrams overall, how
#                    often its first / second word occurs (found by LIKE-matching
#                    the bigram's prefix / suffix against each word), and the
#                    number of words overall.
#   * outer SELECTs - turn the counts into probabilities and compute
#                    log(p_w1w2 / (p_w1 * p_w2)) / -log(p_w1w2).
# `ORDER BY 10` sorts by the tenth output column (npmi); LIMIT 50 keeps the top 50.
# The f-prefix has no effect here: the string contains no placeholders.
# NOTE(review): log() relies on SQLite's math functions being available in the
# SQLite build used by Python - confirm for the target environment.
top_50_by_npmi_query = f"""
    WITH RECURSIVE split_words(id, word, pos, text) AS (
        SELECT 
            id,
            substr(text || ' ', 1, instr(text || ' ', ' ')),
            instr(text || ' ', ' '),
            text
        FROM 
            articles
        UNION ALL
        SELECT 
            id,
            substr(text || ' ', pos, instr(substr(text || ' ', pos) || ' ', ' ')),
            instr(substr(text || ' ', pos) || ' ', ' ') + pos,
            text
        FROM 
            split_words
        WHERE 
            pos < length(text || ' ')
    ),
    bigrams AS (
        SELECT 
            id, 
            word || ' ' || lead(word) OVER (PARTITION BY id ORDER BY pos) AS bigram
        FROM 
            split_words
        WHERE 
            TRIM(word) != ''
    )
    SELECT
        bigram,
        w1w2_count,
        total_bigrams_count,
        w1_count,
        w2_count,
        total_words_count,
        p_w1w2,
        p_w1,
        p_w2,
        CAST(log(p_w1w2 / (p_w1 * p_w2)) / -log(p_w1w2) AS REAL) AS npmi
    FROM
        (
            SELECT
                *,
                CAST(w1w2_count AS REAL)/CAST(total_bigrams_count AS REAL) AS p_w1w2,
                CAST(w1_count AS REAL)/CAST(total_words_count AS REAL) AS p_w1,
                CAST(w2_count AS REAL)/CAST(total_words_count AS REAL) AS p_w2
            FROM
            (
                SELECT
                    bigram,
                    count(*) AS w1w2_count,
                    (SELECT count(*) FROM bigrams WHERE bigram IS NOT NULL) AS total_bigrams_count,
                    (SELECT count(*) FROM split_words WHERE b.bigram LIKE word || ' %') AS w1_count,
                    (SELECT count(*) FROM split_words WHERE b.bigram LIKE '% ' || word) AS w2_count,
                    (SELECT count(*) FROM split_words WHERE TRIM(word) != '') AS total_words_count
                FROM 
                    bigrams b
                WHERE 
                    b.bigram IS NOT NULL
                GROUP BY 
                    b.bigram
            )
        )
    ORDER BY 
        10 DESC
    LIMIT 50
    ;
"""

In [17]:
# Run the NPMI query against the SQLite database and display the result as a DataFrame.
pd.read_sql(top_50_by_npmi_query, connection)

Unnamed: 0,bigram,w1w2_count,total_bigrams_count,w1_count,w2_count,total_words_count,p_w1w2,p_w1,p_w2,npmi
0,fish fry,6,17141,6,6,17501,0.00035,0.000343,0.000343,1.005224
1,linings playbook,5,17141,5,5,17501,0.000292,0.000286,0.000286,1.005107
2,los angeles,5,17141,5,5,17501,0.000292,0.000286,0.000286,1.005107
3,nightmare alley,5,17141,5,5,17501,0.000292,0.000286,0.000286,1.005107
4,guardians galaxy,4,17141,4,4,17501,0.000233,0.000229,0.000229,1.004971
5,barack obama,3,17141,3,3,17501,0.000175,0.000171,0.000171,1.004805
6,iberian peninsula,3,17141,3,3,17501,0.000175,0.000171,0.000171,1.004805
7,licorice pizza,3,17141,3,3,17501,0.000175,0.000171,0.000171,1.004805
8,willy wanker,3,17141,3,3,17501,0.000175,0.000171,0.000171,1.004805
9,allison rader,2,17141,2,2,17501,0.000117,0.000114,0.000114,1.00459


### Закрываем соединение

In [18]:
# Close the SQLite connection opened earlier in the notebook.
connection.close()