In [5]:
#!pip install tokenizers
#!pip install gensim

In [7]:
import os
import pandas as pd

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import sagemaker
from sagemaker import get_execution_role
import sagemaker_pyspark

#Visualizaciones y Varios
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import boto3
# SageMaker execution role for this notebook.
# NOTE(review): `role` is not referenced in the cells visible here — confirm it is needed later.
role = get_execution_role()

# Jars shipped with sagemaker_pyspark. They are handed to the builder so they
# are on the driver classpath when the session is created.
jars = sagemaker_pyspark.classpath_jars()
# Reuse the list fetched above instead of querying the package a second time.
classpath = ":".join(jars)

spark = (SparkSession.builder
         .config("spark.driver.extraClassPath", classpath)
         # NOTE(review): with master "local[*]" everything runs inside the driver
         # process, so the executor memory setting probably has no effect — confirm.
         .config("spark.executor.memory","60g")
         .config("spark.driver.memory","60g")
         # "0" lifts the cap on the total size of results collected to the driver.
         .config("spark.driver.maxResultSize", "0")
         .master("local[*]").getOrCreate())

# Make Spark SQL treat identifiers (e.g. column names) as case sensitive.
spark.conf.set("spark.sql.caseSensitive", "true")

In [8]:
# Location of the tweet dataset in S3.
bucket = 'politicos-dataset-us-east-1'
region = 'us-east-1'
prefix = 'gpt2_fine_tune/tweets_politicos'

# Point the s3a filesystem at the regional S3 endpoint so that
# `s3a://` paths resolve against the bucket's region.
s3a_endpoint = 's3.{}.amazonaws.com'.format(region)
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set('fs.s3a.endpoint', s3a_endpoint)

In [10]:
# Read the parquet dataset stored under <bucket>/<prefix>/parquet into a Spark DataFrame.
source_uri = 's3a://{}/{}/parquet'.format(bucket, prefix)
SDF = spark.read.parquet(source_uri)

In [11]:
# Number of rows in the loaded Spark DataFrame (a Spark action; the value is shown as the cell output).
SDF.count()

170306

In [12]:
# Collect the whole Spark DataFrame into a single in-memory pandas DataFrame on the driver.
df = SDF.toPandas()
# Bare expression: renders the pandas DataFrame as the cell output.
df

Unnamed: 0,id_str,hashtags,screen_name,target_id,text,entities,key_phrases,sentiment,dt,Partido,Coalicion,text_clean,tokenized
0,1134966472260931585,"[CuentaPública, ChileenMarcha]",_alecandia,57,Ya estamos en el Congreso con los subses @Caro...,"[(42, 52, 0.990331768989563, CarolCBown, PERSO...",[],NEUTRAL,2019-06-01 23:34:28,IND-GOB,Chile Vamos,Ya estamos en el Congreso con los subses @Caro...,"[ya, estamos, en, el, congreso, con, los, subs..."
1,1134991775045685248,"[Araucanía, CuentaPública]",nonaossandon,681,"⭕ ""Combatir con máxima voluntad y firmeza, sie...","[(279, 302, 0.9772765636444092, https://t.co/A...",[],NEGATIVE,2019-06-02 01:15:00,RN,Chile Vamos,"⭕ ""Combatir con máxima voluntad y firmeza, sie...","[⭕, ""combatir, con, máxima, voluntad, y, firme..."
2,1135003138753277956,[CuentaPública],feguigurenc,176,#CuentaPública\n📢ANUNCIO| Nuestro Presidente a...,"[(33, 43, 0.7224394679069519, Presidente, PERS...","[(0, 14, 0.9999927282333374, #CuentaPública), ...",NEUTRAL,2019-06-02 02:00:10,RN,Chile Vamos,#CuentaPública\n📢ANUNCIO| Nuestro Presidente a...,"[#cuentapública, 📢anuncio|, nuestro, president..."
3,1135008118277124096,[CuentaP√∫blica],ANDRESCELISM,240,El agua es un tema muy importante para nuestra...,"[(93, 111, 0.9997791647911072, 26 nuevos embal...","[(11, 18, 0.656255841255188, un tema)]",NEUTRAL,2019-06-02 02:19:57,RN,Chile Vamos,El agua es un tema muy importante para nuestra...,"[el, agua, es, un, tema, muy, importante, para..."
4,1135008271104974848,[Cuentapublica],Diego_Schalper,287,Bien límite a la reelección. Ojalá reforma al ...,"[(94, 101, 0.7375357747077942, semanas, QUANTI...","[(5, 11, 0.8332874178886414, límite), (14, 27,...",NEUTRAL,2019-06-02 02:20:33,RN,Chile Vamos,Bien límite a la reelección. Ojalá reforma al ...,"[bien, límite, a, la, reelección., ojalá, refo..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
170301,1285265827017756672,"[CuidaTuVida, COVID19]",MuniColina,601,#CuidaTuVida | ¡Córtala de una vez!\nEn tiempo...,"[(27, 34, 0.7483243942260742, una vez, QUANTIT...","[(0, 12, 0.9999967813491821, #CuidaTuVida), (2...",NEGATIVE,2020-07-20 17:30:26,UDI,Chile Vamos,#CuidaTuVida | ¡Córtala de una vez!\nEn tiempo...,"[#cuidatuvida, |, ¡córtala, de, una, vez!, en,..."
170302,1285296001125888001,[MercadoCampesino],MuniColina,601,El #MercadoCampesino de @INDAP_Chile ya cuenta...,"[(24, 36, 0.6468220949172974, @INDAP_Chile, OR...","[(3, 20, 0.9999831914901733, #MercadoCampesino...",NEUTRAL,2020-07-20 19:30:20,UDI,Chile Vamos,El #MercadoCampesino de @INDAP_Chile ya cuenta...,"[el, #mercadocampesino, de, @indap_chile, ya, ..."
170303,1285663758933786624,[],renesaffirio,476,Hoy Cámara Diputados aprobó y despachó a ley p...,"[(0, 3, 0.9184558391571045, Hoy, DATE), (4, 20...","[(4, 20, 0.8305284976959229, Cámara Diputados)...",NEUTRAL,2020-07-21 19:51:41,IND,Independiente,Hoy Cámara Diputados aprobó y despachó a ley p...,"[hoy, cámara, diputados, aprobó, y, despachó, ..."
170304,1285690326909427714,[],HarryJurgensen,503,Nos reunimos con alcaldes de la región para da...,"[(17, 25, 0.5282663702964783, alcaldes, PERSON...",[],NEUTRAL,2020-07-21 21:37:15,RN,Chile Vamos,Nos reunimos con alcaldes de la región para da...,"[nos, reunimos, con, alcaldes, de, la, región,..."


In [15]:
#!pip install fastparquet

In [18]:
# Export the pandas copy of the dataset to S3 as gzip-compressed parquet.
# A dedicated name is used for the destination key: rebinding the global
# `prefix` here would silently change the path that the Spark read cell builds
# from `prefix` if that cell is re-run after this one.
# NOTE(review): if later cells relied on `prefix` holding the pandas path,
# point them at `pandas_prefix` instead.
pandas_prefix = 'gpt2_fine_tune/tweets_politicos/pandas/parquet'
df.to_parquet('s3://{}/{}'.format(bucket, pandas_prefix), compression='GZIP')