In [1]:
!pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2023.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m77.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.7.0 fastparquet-2023.8.0


In [2]:
# Download the xx_ent_wiki_sm spaCy package; it is loaded later with spacy.load("xx_ent_wiki_sm").
!python -m spacy download xx_ent_wiki_sm

Collecting xx-ent-wiki-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.6.0/xx_ent_wiki_sm-3.6.0-py3-none-any.whl (11.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m86.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Installing collected packages: xx-ent-wiki-sm
Successfully installed xx-ent-wiki-sm-3.6.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_ent_wiki_sm')


In [3]:
import ast
import os
import re
import string
import warnings
from itertools import chain
from typing import List

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import spacy
from wordcloud import WordCloud, STOPWORDS

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier

import torch
from torch.utils.data import DataLoader, random_split, Dataset
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

%matplotlib inline

In [4]:
# Kaggle exposes the input data files in the read-only "../input/" directory.
# Walk that tree and print each directory path (file names are not printed).

import os

input_root = '/kaggle/input'
for dirname, _subdirs, _filenames in os.walk(input_root):
    print(dirname)

# Up to 20GB can be written to the current directory (/kaggle/working/); it is preserved as output when a version is created with "Save & Run All".
# Temporary files can also go to /kaggle/temp/, but they are not saved outside of the current session.

/kaggle/input
/kaggle/input/mantis-analytics-location-detection


In [5]:
# Processing the datasets preferably needs only the CPU; CUDA is selected only when torch reports it as available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'device {device}')

device cpu


In [6]:
# Root of the competition data; it ends with '/', so file names are appended directly.
project_path = '/kaggle/input/mantis-analytics-location-detection/'

# Full paths of the two geo datasets and the test set.
ru_geo_path = f'{project_path}ru_geo_dataset.csv'
uk_geo_path = f'{project_path}uk_geo_dataset.csv'
test_path = f'{project_path}test.csv'

In [7]:
# loc_markers is stored in the CSV as the text of a Python literal (a list of (start, end) tuples).
# It is parsed with ast.literal_eval instead of eval: eval would execute any expression found in
# the file, while literal_eval only accepts plain literals and raises on anything else.
uk_geo_dataset = pd.read_csv(uk_geo_path, converters={"loc_markers": ast.literal_eval})

# geo_dataset is the name the processing cells below operate on.
geo_dataset = uk_geo_dataset

In [8]:
# Preview the loaded dataset; as the last expression of the cell it is rendered by the notebook.
geo_dataset

Unnamed: 0,text,loc_markers,org_markers,per_markers,is_valid
0,"Чим довше мають скачки тиску гіпертензією, тим...",[],[],[],0
1,"А поки що починали цвісти троянди, випускники ...",[],[],"[(114, 131)]",0
2,"Крім того, в 2020 р. багато експертів прогнозу...",[],[],[],0
3,"Сильно сумніваюся, що ви зупините свій вибір н...",[],[],[],0
4,Цей унікальний правовий важіль утворено 1998 р...,[],[],[],0
...,...,...,...,...,...
1009995,"Траєкторія польоту цих літаків є провокуючою”,...",[],[],"[(56, 63)]",1
1009996,Якщо порівняти у відсотковому співвідношенні к...,[],[],[],1
1009997,"У інфікувалися четверо жінок, наймолодшій із н...",[],[],[],1
1009998,"Хочу запевнити, що створення Православної Церк...",[],"[(29, 56)]",[],1


In [9]:
# Load the multilingual spaCy package with the listed components disabled;
# batch_bio_labeling only reads token texts and character offsets from the resulting docs.
nlp = spacy.load(
    "xx_ent_wiki_sm",
    disable=["tagger", "parser", "ner", "textcat"],
)

In [10]:
# A slightly updated version of the code provided in the lecture.

# This modified code generates BIO labels for location markers.
# It processes the text and location markers to determine which tokens are part of a location entity (Begin or Inside) and which are not (Outside).
# We then store the labels in a separate column for further modelling.

def batch_bio_labeling(texts, loc_markers_list, tokenizer, batch_size=128, n_process=16, verbose=False):
    """Tokenize texts and assign a BIO location label to every token.

    Args:
        texts: Iterable of raw text strings.
        loc_markers_list: One entry per text, each an iterable of (start, end)
            character offsets of a location. ``end`` is treated as exclusive.
        tokenizer: Object with a ``pipe(texts, batch_size=..., n_process=...)``
            method yielding one doc per text; every token of a doc must expose
            ``.text`` and ``.idx`` (character offset of the token in the text).
        batch_size: Forwarded to ``tokenizer.pipe``.
        n_process: Forwarded to ``tokenizer.pipe``.
        verbose: When True, print a progress message for each stage.

    Returns:
        A list with one ``(tokens, labels)`` tuple per text. ``tokens`` is the
        list of token strings and ``labels`` the parallel list of
        'B-LOC' / 'I-LOC' / 'O' labels.
    """
    if verbose:
        print("Applying Tokenizer")
    # Consume the docs lazily instead of materialising all of them in a list first;
    # only the plain (tokens, labels) results are kept in memory.
    docs = tokenizer.pipe(texts, batch_size=batch_size, n_process=n_process)
    batch_results = []

    if verbose:
        print("Extracting BIO labels")
    for doc, loc_markers in zip(docs, loc_markers_list):
        tokens = [token.text for token in doc]
        # Character span [token_start, token_end) covered by each token
        token_spans = [(token.idx, token.idx + len(token.text)) for token in doc]
        labels = ['O'] * len(tokens)

        for start, end in loc_markers:
            # A token belongs to the entity when its span overlaps [start, end).
            # This includes the last token of the entity and a token the marker starts inside of.
            covered = [
                i for i, (token_start, token_end) in enumerate(token_spans)
                if token_start < end and token_end > start
            ]
            if not covered:
                # The marker matches no token (e.g. offsets beyond the text): label nothing
                continue

            # Mark the first token as B-LOC (beginning) and the rest as I-LOC (inside)
            labels[covered[0]] = 'B-LOC'
            for i in covered[1:]:
                labels[i] = 'I-LOC'

        batch_results.append((tokens, labels))

    return batch_results


In [11]:
# Tokenize every text of the current dataset and derive its BIO labels
geo_processed_dataset = batch_bio_labeling(
    geo_dataset["text"].to_list(),
    geo_dataset["loc_markers"].to_list(),
    nlp,
    verbose=True,
)

# Split the (tokens, labels) pairs into two columns and carry is_valid over from the source frame
geo_processed_df = pd.DataFrame({
    "tokens": [tokens for tokens, _labels in geo_processed_dataset],
    "labels": [labels for _tokens, labels in geo_processed_dataset],
    "is_valid": geo_dataset["is_valid"].to_list(),
})

# Save the processed dataset so it can be reused in further experiments or as a back-up
geo_processed_df.to_parquet('uk_geo_processed.parquet', engine='fastparquet')

Applying Tokenizer
Extracting BIO labels


In [7]:
# loc_markers is stored in the CSV as the text of a Python literal (a list of (start, end) tuples).
# It is parsed with ast.literal_eval instead of eval: eval would execute any expression found in
# the file, while literal_eval only accepts plain literals and raises on anything else.
ru_geo_dataset = pd.read_csv(ru_geo_path, converters={"loc_markers": ast.literal_eval})

# geo_dataset is the name the processing cells below operate on.
geo_dataset = ru_geo_dataset

KeyboardInterrupt: 

In [None]:
# Preview the loaded dataset; as the last expression of the cell it is rendered by the notebook.
geo_dataset

In [None]:
import pandas as pd

# Kaggle kept killing the kernel when the whole ru dataset was processed in one go
# (apparently some notebook resource limit was being hit), so the dataset is split
# into 9 smaller parts. Each part is saved to its own parquet file, which also gives
# us back-up data in case a later part fails.

num_parts = 9
part_size = len(geo_dataset) // num_parts

# Processed parts are collected here and concatenated once after the loop;
# concatenating onto a growing frame inside the loop would re-copy all earlier rows every time.
processed_parts = []

for part in range(num_parts):
    start_idx = part * part_size
    # The last part also takes the rows left over by the integer division above
    end_idx = (part + 1) * part_size if part < num_parts - 1 else len(geo_dataset)

    # Extract the rows of the current part by position
    geo_subset = geo_dataset.iloc[start_idx:end_idx]

    print(f'part_{part+1}')
    print(len(geo_subset))
    geo_processed_dataset = batch_bio_labeling(geo_subset.text.to_list(), geo_subset.loc_markers.to_list(), nlp, verbose=True)

    # Convert the processed part to a DataFrame, keeping the document and sentence ids
    geo_processed_df = pd.DataFrame({
        "tokens": [el[0] for el in geo_processed_dataset],
        "labels": [el[1] for el in geo_processed_dataset],
        "doc_id": geo_subset["doc_id"].to_list(),
        "sent_id": geo_subset["sent_id"].to_list(),
    })

    # Save the processed part as a parquet file
    filename = f'ru_geo_dataset_BIO_labeled_part{part + 1}.parquet'
    geo_processed_df.to_parquet(filename, engine='fastparquet')
    print(f'Saved {filename}')

    processed_parts.append(geo_processed_df)


# Merge all parts; ignore_index gives the combined frame one continuous 0..n-1 index
# instead of repeating each part's own 0-based index.
combined_df = pd.concat(processed_parts, ignore_index=True)

# Save the merged dataframe as a parquet file
combined_df.to_parquet('ru_geo_processed.parquet', engine='fastparquet')
print('Saved ru_geo_processed.parquet')

part_7
892093
Applying Tokenizer
Extracting BIO labels
Saved ru_geo_dataset_BIO_labeled_part7.parquet
part_8
892093
