In [2]:
import sys

from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from sparknlp.pretrained import ResourceDownloader

# Obtain the Spark session through Spark NLP's start() helper; `spark` is
# reused below for configuration and for building the Spark DataFrame.
spark = sparknlp.start()

# Print the library versions so the run can be matched to an environment.
print("Spark NLP version: ",sparknlp.version())
print("Spark version: ", spark.version)

Spark NLP version:  3.0.3
Spark version:  3.0.2


In [3]:
import pandas as pd
# Load the two input CSV files from the local project folder.
predmety = pd.read_csv(
    r'C:\Users\zbyne\Documents\čokoláda\VŠE\datový projekt\Textová analytika - Projekt\BaseCleanUp\predmety_kit.csv'
)

# Read the export and keep only the columns at positions 1, 3, 4, 5 and 6.
df_exportedPandas = pd.read_csv(
    r'C:\Users\zbyne\Documents\čokoláda\VŠE\datový projekt\Textová analytika - Projekt\BaseCleanUp\Exported.csv'
).iloc[:, [1, 3, 4, 5, 6]]

# Preview the first rows of the reduced frame.
df_exportedPandas.head()

Unnamed: 0,ID,OTAZKA,PEDAGOG,HODNOTA_CLOB_OCISTENO,SENTIMENT
0,933,4,,pravděpodobně odstranění připouštěcího testu k...,Negative
1,940,3,,-nepovinná docházka s ústní zkouškou na konci,Positive
2,200,4,,"Minimální překrytí s agilním vývojem, byl to s...",Negative
3,206,3,,"Fajn pro lidi co chtějí umět jazyk react, host...",Positive
4,2344,4,,"lepší koncepce cvičení, občas mi přišlo, že se...",Negative


In [4]:
# Collect the values of the column at position 2 of `predmety` into a plain list.
prdmt = [entry for entry in predmety.iloc[:, 2]]

In [5]:
import numpy as np

# Enable Arrow-based pandas <-> Spark conversion.
# "spark.sql.execution.arrow.enabled" is deprecated as of Spark 3.0 (this
# notebook reports Spark 3.0.2); the current key is the ".pyspark." variant.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Create the Spark DataFrame from the pandas DataFrame loaded above.
data = spark.createDataFrame(df_exportedPandas)


In [6]:
# Print the first rows of the Spark DataFrame with all of its columns.
data.show()

+-----+------+-------+---------------------+---------+
|   ID|OTAZKA|PEDAGOG|HODNOTA_CLOB_OCISTENO|SENTIMENT|
+-----+------+-------+---------------------+---------+
|  933|     4|   null| pravděpodobně ods...| Negative|
|  940|     3|   null| -nepovinná docház...| Positive|
|  200|     4|   null| Minimální překryt...| Negative|
|  206|     3|   null| Fajn pro lidi co ...| Positive|
| 2344|     4|   null| lepší koncepce cv...| Negative|
| 2345|     3|   null| přednášky externi...| Positive|
| 1588|     4|   null| udělat místo před...| Negative|
| 3576|     4|   null|     probrat ty makra| Negative|
| 3582|     3|   null| naučíte se potřeb...| Positive|
| 3608|     4|   null| bohužel velmi nep...| Negative|
| 3612|     3|   null| na přednáškách ho...| Positive|
| 5180|     3|   null|            praktické| Positive|
| 6048|    23|   null| Na problematiku d...|  Neutral|
| 9986|    23|   null| programování v Ja...|  Neutral|
|10566|     3|   null| Vyzkoušeli jsme s...| Positive|
|10632|   

In [7]:
# Entry stage: reads the text column and emits the 'document' annotation
# column, with the cleanup mode set to "shrink".
document = (
    DocumentAssembler()
    .setInputCol('HODNOTA_CLOB_OCISTENO')
    .setOutputCol('document')
    .setCleanupMode("shrink")
)

In [8]:
# Pretrained language detector; reads 'document' and writes its result to 'LANG'.
language_detector = (
    LanguageDetectorDL.pretrained("ld_wiki_tatoeba_cnn_21")
    .setInputCols(["document"])
    .setOutputCol("LANG")
    .setThreshold(0.8)
    .setCoalesceSentences(True)
)

ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7,1 MB
[OK!]


In [9]:
# Pretrained "opus_mt_cs_en" translation model; reads 'document' and writes
# the translated text to 'translation'.
marian = (
    MarianTransformer.pretrained("opus_mt_cs_en", "xx")
    .setInputCols(["document"])
    .setOutputCol("translation")
)

opus_mt_cs_en download started this may take some time.
Approximate size to download 386,5 MB
[OK!]


In [10]:
# Sentence detection runs on the translated text, not on the original input.
sentence = (
    SentenceDetector()
    .setInputCols("translation")
    .setOutputCol('sentence')
)

In [11]:
# Tokenize the detected sentences into the 'token' column.
tokenizer = (
    Tokenizer()
    .setInputCols('sentence')
    .setOutputCol('token')
)

In [12]:
# Pretrained English stop-word cleaner; reads 'token' and writes 'cleanTokens'.
stop_words = (
    StopWordsCleaner.pretrained("stopwords_en", "en")
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
)

stopwords_en download started this may take some time.
Approximate size to download 2,9 KB
[OK!]


In [13]:
# Dictionary-based lemmatizer over the stop-word-free tokens. The dictionary
# file is resolved relative to the current working directory.
lemmatizer = (
    Lemmatizer()
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
    .setDictionary(
        "./AntBNC_lemmas_ver_001.txt",
        value_delimiter="\t",
        key_delimiter="->",
    )
)

In [14]:
# Pretrained part-of-speech tagger; reads the translated document together
# with the unfiltered tokens ('token', not 'cleanTokens').
pos = (
    PerceptronModel.pretrained("pos_ud_ewt", "en")
    .setInputCols(["translation", "token"])
    .setOutputCol("pos")
)

pos_ud_ewt download started this may take some time.
Approximate size to download 2,2 MB
[OK!]


In [15]:
# Default pretrained Vivekn sentiment model; reads the translated document
# and its tokens and writes 'sentiment_result'.
sentiment_detector = (
    ViveknSentimentModel.pretrained()
    .setInputCols(["translation", "token"])
    .setOutputCol("sentiment_result")
)

sentiment_vivekn download started this may take some time.
Approximate size to download 873,6 KB
[OK!]


In [16]:
# Assemble the stages in execution order; each stage consumes columns
# produced by earlier ones.
nlp_pipeline = Pipeline(
    stages=[
        document,
        language_detector,
        marian,
        sentence,
        tokenizer,
        stop_words,
        lemmatizer,
        pos,
        sentiment_detector,
    ]
)

# Fit on the input frame, then annotate that same frame.
model = nlp_pipeline.fit(data)
result = model.transform(data)

In [17]:
# Keep the original columns and flatten each annotation column down to its
# `.result` field. The column order here fixes the positions used later.
res = result.selectExpr(
    'ID',
    'OTAZKA',
    'PEDAGOG',
    'HODNOTA_CLOB_OCISTENO',
    'LANG.result as LANG',
    'token.result as token',
    'lemma.result as lemma',
    'pos.result as pos',
    'sentiment_result.result as DetectSentiment',
    'SENTIMENT',
)

In [18]:
# Sanity check: display which DataFrame class `res` is.
type(res)

pyspark.sql.dataframe.DataFrame

In [19]:
# Convert the Spark DataFrame `res` into a pandas DataFrame for the
# pandas-based steps that follow.
res_expPandasDF = res.toPandas()

In [23]:
# Remove a previously computed "Length" column, if there is one.
# In top-to-bottom order this cell runs BEFORE the cell that creates "Length",
# so on a fresh kernel the column does not exist yet; errors="ignore" makes
# the drop a no-op in that case instead of raising KeyError.
res_expPandasDF.drop(columns="Length", errors="ignore", inplace=True)

In [24]:
# Preview the first rows of the pandas result.
res_expPandasDF.head()

Unnamed: 0,ID,OTAZKA,PEDAGOG,HODNOTA_CLOB_OCISTENO,LANG,token,lemma,pos,DetectSentiment,SENTIMENT
0,933,4,,pravděpodobně odstranění připouštěcího testu k...,[cs],"[probably, removing, the, pass, test, for, the...","[remove, pass, test, test, ,, test, farce, ,, ...","[ADV, VERB, DET, NOUN, NOUN, ADP, DET, NOUN, P...",[positive],Negative
1,940,3,,-nepovinná docházka s ústní zkouškou na konci,[cs],"[-, an, optional, attendance, with, a, oral, e...","[-, optional, attendance, oral, exam, end]","[PUNCT, DET, ADJ, NOUN, ADP, DET, ADJ, NOUN, A...",[negative],Positive
2,200,4,,"Minimální překrytí s agilním vývojem, byl to s...",[cs],"[Minimum, overlap, with, agile, development, ,...","[Minimum, overlap, agile, development, ,, hard...","[ADJ, NOUN, ADP, NOUN, NOUN, PUNCT, PRON, AUX,...",[negative],Negative
3,206,3,,"Fajn pro lidi co chtějí umět jazyk react, host...",[cs],"[Good, for, people, who, want, to, learn, the,...","[Good, people, learn, language, Fact, ,, guest...","[ADJ, ADP, NOUN, PRON, VERB, PART, VERB, DET, ...",[positive],Positive
4,2344,4,,"lepší koncepce cvičení, občas mi přišlo, že se...",[cs],"[better, concept, of, exercise, ,, sometimes, ...","[concept, exercise, ,, think, exercise, base, ...","[ADJ, NOUN, ADP, NOUN, PUNCT, ADV, PRON, VERB,...",[positive],Negative


In [None]:
# Debug aid (disabled): print(res_expPandasDF.iloc[1, 4], len(res_expPandasDF.iloc[1, 4]))

In [21]:
# Compute len() of every value in the column at position 4 and store it as
# the "Length" column.
# NOTE(review): per the selectExpr column order, position 4 is LANG (the list
# of detected languages), not the text or token column — confirm this is the
# column whose length is wanted.
listWithLength = [len(cell) for cell in res_expPandasDF.iloc[:, 4]]

res_expPandasDF["Length"] = listWithLength

In [26]:
# Write the annotated frame to CSV in the local project folder, encoded as UTF-8.
res_expPandasDF.to_csv(
    r'C:\Users\zbyne\Documents\čokoláda\VŠE\datový projekt\Textová analytika - Projekt\BaseCleanUp\EnglishLemmaCloud.csv',
    encoding="utf-8",
)

In [47]:
#ShrinkedDF.to_csv(r'C:\Users\zbyne\Documents\čokoláda\VŠE\datový projekt\Textová analytika - Projekt\BaseCleanUp\EnglishCloudWithLimitedLength.csv')