# Обработка текстовых данных

In [None]:
# Install the notebook's dependencies via the `!` shell escape.
# The wget install below is left disabled.
# !pip install wget
!pip install ufal.udpipe



In [None]:
import rus_preprocessing_udpipe as UDPipe
from ufal.udpipe import Model, Pipeline

# Load the UDPipe model used for preprocessing.
# NOTE(review): the file name points at a Russian (SynTagRus) model, while the
# texts displayed in this notebook are English -- confirm this is intended.
udpipe_model_path = '/content/drive/MyDrive/EMO/model/russian-syntagrus-ud-2.0-170801.udpipe'
model2 = Model.load(udpipe_model_path)

# Model.load reports failure by returning a falsy value instead of raising,
# so fail loudly here rather than later, when the pipeline is first used.
if not model2:
    raise RuntimeError(f'Cannot load UDPipe model from {udpipe_model_path!r}')


Loading the model...
Processing input...


In [None]:
import pandas as pd

# Load the semicolon-separated table of resource descriptions.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which
# looks like a saved index read back as data -- confirm whether it is needed.
df = pd.read_csv('descs.csv', sep=';')

In [None]:
# Preview the frame right after loading.
df

Unnamed: 0,text,res_id
0,Tin Ore is a T3 resource. Harvesting this reso...,ORE_T3
1,Iron Ore is a T4 resource. Harvesting this res...,ORE_T4
2,Titanium Ore is a Tier 5 resource. Harvesting ...,ORE_T5
3,Chestnut Logs are a Tier 3 resource. They may ...,WOOD_T3
4,Pine Logs are a Tier 4 resource. They may be f...,WOOD_T4
5,Cedar Logs are a Tier 5 resource. They may be ...,WOOD_T5
6,yz near fort sterling like cairn cloatch or pe...,ORE_T5
7,for t5 ore i'd go farm somewhere around fort s...,ORE_T5
8,"GO FARM T4 ORES AT PEN GARN OR PEN GENT, STOP ...",ORE_T4
9,Avalonian roads. You gonna be an ore slave if ...,ORE_T5


In [None]:
from nltk.tokenize import RegexpTokenizer
from pymystem3 import Mystem
from nltk.corpus import stopwords

m = Mystem()

# Tokenization + lemmatization.
# The character class must be [A-Za-z]: the range A-z also covers the ASCII
# characters between 'Z' and 'a' ([ \ ] ^ _ `), which would leak into tokens.
# The tokenizer is built once instead of once per row.
word_tokenizer = RegexpTokenizer(r'[A-Za-z]+')
df['token'] = df['text'].apply(
    lambda x: [word.lower() for word in word_tokenizer.tokenize(x)]
)

# Mystem lemmatizes the space-joined tokens; whitespace-only items of its
# output are dropped.
# NOTE(review): in the frame shown in this notebook the 'lemma' column matches
# the 'token' column for the visible English rows -- confirm Mystem is the
# intended lemmatizer for this data.
df['lemma'] = df['token'].apply(
    lambda x: [word for word in m.lemmatize(' '.join(x)) if word.strip()]
)

In [None]:
import nltk
# Fetch the NLTK stop-word corpus used by the stop-word filtering step.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

# Stop-word removal.
# The English stop-word collection is built once, as a set: the previous form
# evaluated stopwords.words('english') again for every single word and then
# searched the resulting list linearly.
english_stopwords = set(stopwords.words('english'))
df['lemma_cor'] = df['lemma'].apply(
    lambda x: [word for word in x if word not in english_stopwords]
)

In [None]:
# Extract morphological features: tag each filtered lemma with its part of speech.
# NOTE(review): nltk.pos_tag presumably needs a tagger resource; only
# 'stopwords' is downloaded in the visible cells -- confirm it is available.
def _tag_words(words):
    """Return (word, tag) pairs produced by nltk.pos_tag for a copy of *words*."""
    return nltk.pos_tag(list(words))


df['morph'] = df['lemma_cor'].apply(_tag_words)

In [None]:
# Preview the frame with the token, lemma, lemma_cor and morph columns added.
df

Unnamed: 0,text,res_id,token,lemma,lemma_cor,morph
0,Tin Ore is a T3 resource. Harvesting this reso...,ORE_T3,"[tin, ore, is, a, t, resource, harvesting, thi...","[tin, ore, is, a, t, resource, harvesting, thi...","[tin, ore, resource, harvesting, resource, adv...","[(tin, NN), (ore, RB), (resource, NN), (harves..."
1,Iron Ore is a T4 resource. Harvesting this res...,ORE_T4,"[iron, ore, is, a, t, resource, harvesting, th...","[iron, ore, is, a, t, resource, harvesting, th...","[iron, ore, resource, harvesting, resource, ad...","[(iron, NN), (ore, RB), (resource, JJ), (harve..."
2,Titanium Ore is a Tier 5 resource. Harvesting ...,ORE_T5,"[titanium, ore, is, a, tier, resource, harvest...","[titanium, ore, is, a, tier, resource, harvest...","[titanium, ore, tier, resource, harvesting, re...","[(titanium, NN), (ore, RB), (tier, JJ), (resou..."
3,Chestnut Logs are a Tier 3 resource. They may ...,WOOD_T3,"[chestnut, logs, are, a, tier, resource, they,...","[chestnut, logs, are, a, tier, resource, they,...","[chestnut, logs, tier, resource, may, found, f...","[(chestnut, NN), (logs, NNS), (tier, JJR), (re..."
4,Pine Logs are a Tier 4 resource. They may be f...,WOOD_T4,"[pine, logs, are, a, tier, resource, they, may...","[pine, logs, are, a, tier, resource, they, may...","[pine, logs, tier, resource, may, found, follo...","[(pine, JJ), (logs, NNS), (tier, IN), (resourc..."
5,Cedar Logs are a Tier 5 resource. They may be ...,WOOD_T5,"[cedar, logs, are, a, tier, resource, they, ma...","[cedar, logs, are, a, tier, resource, they, ma...","[cedar, logs, tier, resource, may, found, foll...","[(cedar, NN), (logs, NNS), (tier, JJR), (resou..."
6,yz near fort sterling like cairn cloatch or pe...,ORE_T5,"[yz, near, fort, sterling, like, cairn, cloatc...","[yz, near, fort, sterling, like, cairn, cloatc...","[yz, near, fort, sterling, like, cairn, cloatc...","[(yz, NN), (near, IN), (fort, NN), (sterling, ..."
7,for t5 ore i'd go farm somewhere around fort s...,ORE_T5,"[for, t, ore, i, d, go, farm, somewhere, aroun...","[for, t, ore, i, d, go, farm, somewhere, aroun...","[ore, go, farm, somewhere, around, fort, sterl...","[(ore, RB), (go, VBP), (farm, NN), (somewhere,..."
8,"GO FARM T4 ORES AT PEN GARN OR PEN GENT, STOP ...",ORE_T4,"[go, farm, t, ores, at, pen, garn, or, pen, ge...","[go, farm, t, ores, at, pen, garn, or, pen, ge...","[go, farm, ores, pen, garn, pen, gent, stop, s...","[(go, VB), (farm, NN), (ores, NNS), (pen, VBP)..."
9,Avalonian roads. You gonna be an ore slave if ...,ORE_T5,"[avalonian, roads, you, gonna, be, an, ore, sl...","[avalonian, roads, you, gonna, be, an, ore, sl...","[avalonian, roads, gonna, ore, slave, went, on...","[(avalonian, JJ), (roads, NNS), (gonna, VBP), ..."


In [None]:
from gensim.models import fasttext

# Train a FastText model on the corpus: one "sentence" per row, each being the
# stop-word-filtered lemma list of that text.
training_sentences = df['lemma_cor']
model1 = fasttext.FastText(
    sentences=training_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# numpy was never imported in this notebook, so this cell failed with
# "NameError: name 'np' is not defined"; import it where it is first needed.
import numpy as np


def _mean_word_vector(words):
    """Return the element-wise mean of the FastText vectors of *words*.

    An empty word list yields a zero vector of the model's dimensionality
    instead of dividing by zero.
    """
    if not words:
        return np.zeros(model1.wv.vector_size)
    return np.mean([model1.wv[word] for word in words], axis=0)


# Vectorize the words and average the resulting vectors for each message.
df['fasttext1'] = df['lemma_cor'].apply(_mean_word_vector)

NameError: name 'np' is not defined

In [None]:
# Text preprocessing: wrap the loaded UDPipe model in a pipeline that is given
# 'tokenize' and 'conllu' as its first and last string options, with library
# defaults for the two options in between.
process_pipeline = Pipeline(
    model2,
    'tokenize',
    Pipeline.DEFAULT,
    Pipeline.DEFAULT,
    'conllu',
)

In [None]:
def _run_udpipe(raw_text):
    """Pass one raw text through UDPipe.process using the shared pipeline."""
    return UDPipe.process(process_pipeline, text=raw_text)


# Store the UDPipe preprocessing result of every raw text in its own column.
df['udpipe'] = df['text'].apply(_run_udpipe)