In [None]:
!pip install gdown

In [None]:
# Download the file with this id via gdown into the current directory.
# NOTE(review): presumably this is the ABSA_glove_absa.zip archive that the
# next cell unpacks - confirm the id still points at that file.
!gdown --id 1uRgJ5MzqoGh-XYQUFAVBUGQYlLi7aMXx

In [None]:
!unzip ABSA_glove_absa.zip

In [None]:
# -*- coding: utf-8 -*-

import os
import json
# Point JAVA_HOME at the Java 8 OpenJDK install and put its bin/ directory
# at the front of PATH (presumably so Spark picks up this JVM).
_java_home = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ['JAVA_HOME'] = _java_home
os.environ['PATH'] = "{}/bin:{}".format(_java_home, os.environ['PATH'])

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import sparknlp
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline

from sparknlp.training import CoNLL
import pyspark.sql.functions as F

# Start the Spark session through Spark NLP and keep it in `spark`, which the
# later cells use to build DataFrames.
# NOTE(review): gpu=True asks for the GPU variant of Spark NLP - confirm the
# runtime actually has a GPU, otherwise switch to sparknlp.start().
spark = sparknlp.start(gpu=True)

In [None]:
# --- Annotators shared by the prediction and sentence pipelines -------------

# Wraps the raw "text" column into a "document" annotation.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Splits each document into sentences.
sentence = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentence')
)

# Tokenizes the whole document (not the individual sentences).
token = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Pretrained GloVe word embeddings computed for every token.
glove_embeddings = (
    WordEmbeddingsModel.pretrained("glove_840B_300", "xx")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

# NER model loaded from the local "ABSA_glove_absa" directory; its tags are
# written to the "absa" column.
loaded_ner_model = (
    NerDLModel.load("ABSA_glove_absa")
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("absa")
)

# Turns the per-token tags in "absa" into spans in "absa_span".
converter = (
    NerConverter()
    .setInputCols(["document", "token", "absa"])
    .setOutputCol("absa_span")
)

# Sentence-splitting pipeline: document assembly followed by sentence
# detection. It is left unfitted here and fitted where it is used.
sent_pipeline = Pipeline(stages=[document, sentence])

# Aspect-extraction pipeline: document -> tokens -> embeddings -> NER tags ->
# "absa_span" spans.
ner_prediction_pipeline = Pipeline(stages=[
    document,
    token,
    glove_embeddings,
    loaded_ner_model,
    converter,
])

# Fit on a one-row frame with an empty "text" value to obtain a model object
# whose transform() can then be applied to real text.
empty_data = spark.createDataFrame([['']]).toDF("text")
prediction_model = ner_prediction_pipeline.fit(empty_data)

In [None]:
# Names of the regular files in the input folder, sorted so that every run
# processes them in the same order. Sub-directories (for example Jupyter's
# ".ipynb_checkpoints") are skipped because the processing loop opens every
# entry as a text file and would fail on a directory.
input_files = sorted(
    name for name in os.listdir('../inputs/')
    if os.path.isfile(os.path.join('../inputs/', name))
)

In [None]:
# Full path of every input file.
input_paths = [os.path.join('../inputs/', file) for file in input_files]
# Matching output path: same base name with a ".csv" extension. splitext only
# swaps the final extension; the previous str.replace('txt', 'csv') rewrote
# every "txt" substring in the name and left names without "txt" unchanged.
output_paths = [
    os.path.join('../outputs/', os.path.splitext(file)[0] + '.csv')
    for file in input_files
]

In [None]:
# Display the list of input paths as a quick check of what will be processed.
input_paths

In [None]:
# Run aspect-based sentiment extraction on every input file and write one CSV
# per file with the columns: sentence, aspect, sentiment.

# Fit the sentence pipeline once up front: the fit only ever sees
# `empty_data`, so repeating it for each file just redoes identical work.
sent_model = sent_pipeline.fit(empty_data)

for in_path, out_path in zip(input_paths, output_paths):
    # Context manager so the file handle is closed as soon as it has been read.
    with open(in_path) as in_file:
        text = in_file.read()

    # One-row Spark DataFrame holding the whole file in the "text" column.
    text_df = spark.createDataFrame(pd.DataFrame({'text': [text]}))
    absa_pdf = prediction_model.transform(text_df).toPandas()
    sent_pdf = sent_model.transform(text_df).toPandas()

    # There is a single input row, so row 0 carries every annotation.
    all_sents = sent_pdf['sentence'][0]

    rows = []
    for result in absa_pdf['absa_span'][0]:
        start, end = result['begin'], result['end']
        # Take the first sentence that fully contains the span, or None when
        # no single sentence does (e.g. the span crosses a sentence boundary).
        # Emitting exactly one value per span keeps the three columns the same
        # length; appending only on a match let them drift apart and made the
        # DataFrame construction fail.
        containing_sentence = next(
            (sent['result'] for sent in all_sents
             if sent['begin'] <= start and sent['end'] >= end),
            None,
        )
        # Any entity label other than "POS" is reported as negative.
        sentiment = "positive" if result['metadata']['entity'] == "POS" else "negative"
        rows.append({
            "sentence": containing_sentence,
            "aspect": result['result'],
            "sentiment": sentiment,
        })

    # Explicit columns so a file with no detected aspects still yields a CSV
    # with the expected header.
    final_result = pd.DataFrame(rows, columns=["sentence", "aspect", "sentiment"])

    # Create the output folder on first use so a fresh checkout does not fail.
    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)
    final_result.to_csv(out_path, index=False)

In [None]:
# List the output folder to check which CSV files were written.
!ls ../outputs

In [None]:
# Load one generated CSV back into pandas for a spot check.
# NOTE(review): the file name is hard-coded; this assumes an input named
# "Example10" was processed - confirm, or pick an entry from output_paths.
df = pd.read_csv("../outputs/Example10.csv")

In [None]:
# Preview the first rows of the loaded result.
df.head()