In [1]:
import os
import pandas as pd

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import sagemaker
from sagemaker import get_execution_role
import sagemaker_pyspark

import boto3
# Value returned by sagemaker.get_execution_role(); it is not referenced
# again in the cells visible here.
role = get_execution_role()

In [2]:
# Jar paths shipped with sagemaker_pyspark, joined into one ':'-separated
# classpath string for the driver.
jars = sagemaker_pyspark.classpath_jars()
classpath = ":".join(jars)

# Local-mode session using every available core ("local[*]").
# spark.driver.maxResultSize = "0" removes the cap on results collected to
# the driver (Spark treats 0 as unlimited).
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.extraClassPath", classpath)
    .config("spark.driver.memory", "60g")
    .config("spark.executor.memory", "60g")
    .config("spark.driver.maxResultSize", "0")
    .getOrCreate()
)

# Make Spark SQL treat identifiers as case-sensitive.
spark.conf.set("spark.sql.caseSensitive", "true")

In [3]:
# Bare expression: renders the session object as this cell's output.
spark

In [4]:
# Location of the input data: bucket name, its AWS region, and the key prefix
# under which the parquet files live.
bucket = 'politicos-dataset-us-east-1'
region = 'us-east-1'
prefix = 'gpt2_fine_tune/tweets_derecha/parquet/'
# Point the Hadoop s3a connector at the regional S3 endpoint built from `region`.
# NOTE: `_jsc` is a private attribute of the SparkSession wrapper.
spark._jsc.hadoopConfiguration().set('fs.s3a.endpoint', 's3.{}.amazonaws.com'.format(region))

In [5]:
# Load the parquet data found at s3a://<bucket>/<prefix> through Spark.
SDF = spark.read.parquet('s3a://{}/{}'.format(bucket, prefix))

In [6]:
# Number of rows loaded; shown as the cell output.
SDF.count()

31631

## Make Datasets

In [7]:
# Collect the whole Spark DataFrame into a pandas DataFrame on the driver,
# then display it as the cell output.
df = SDF.toPandas()
df

Unnamed: 0,screen_name,text_clean,Partido,Coalicion,dt
0,evelynmatthei,Gracias al apoyo de @cbsantiago y de la @Serem...,UDI,Chile Vamos,2019-06-04 15:26:43
1,SanPedroMuni,Centros de Madres y Talleres realizan exitosa ...,UDI,Chile Vamos,2019-06-12 16:11:16
2,vtroncosoh,Hoy participé en la conmemoración del “Día Int...,UDI,Chile Vamos,2019-06-12 20:01:44
3,SanPedroMuni,PATIO DEL LICEO MUNICIPAL de #SanPedro se encu...,UDI,Chile Vamos,2019-06-14 13:38:41
4,javiermacaya,@mcubillossigall a puro carácter y sin vacilar...,UDI,Chile Vamos,2019-06-19 02:08:42
...,...,...,...,...,...
31626,Alvaro_CarterF,Quiero desear un muy feliz #DiaDelPeriodista a...,UDI,Chile Vamos,2020-07-11 15:24:30
31627,SanAntonio_Gob,⭕️ Gobernadora @gaby_alcalde sostuvo encuentro...,UDI,Chile Vamos,2020-07-17 19:39:18
31628,laSUBDERE,📹El subsecretario @juanmasferrer explica en qu...,UDI,Chile Vamos,2020-07-19 21:00:00
31629,MuniColina,#CuidaTuVida | ¡Córtala de una vez!\nEn tiempo...,UDI,Chile Vamos,2020-07-20 17:30:26


In [8]:
from sklearn.model_selection import train_test_split
import re

def build_text_files(tweets, dest_path, encoding='utf-8'):
    """Write every tweet to ``dest_path`` as one continuous line of text.

    Each item is converted with ``str()``, stripped of leading/trailing
    whitespace, has its newline characters replaced by single spaces, and is
    followed by a two-space separator (also after the last item).

    Args:
        tweets: Iterable of tweet texts; items are passed through ``str()``.
        dest_path: Path of the file to create or overwrite.
        encoding: Text encoding of the output file. Defaults to UTF-8 so that
            accented characters and emoji are written the same way
            regardless of the platform's locale.

    Raises:
        OSError: If ``dest_path`` cannot be opened for writing.
    """
    # Build the output in one pass with join() instead of repeated `+=`,
    # which is quadratic in the total text length.
    data = "".join(
        str(tweet).strip().replace("\n", " ") + "  "
        for tweet in tweets
    )
    # The context manager guarantees the handle is flushed and closed, even
    # if the write fails.
    with open(dest_path, 'w', encoding=encoding) as f:
        f.write(data)

In [9]:
# Only the 'text_clean' column is used to build the text datasets.
tweets = df['text_clean']

In [10]:
# Hold out 15% of the tweets for evaluation. A fixed random_state makes the
# shuffle (and therefore the two files written below) identical on every
# Restart & Run All; without it each run produced a different split.
train, test = train_test_split(tweets, test_size=0.15, random_state=42)

# Write each split to a text file in the current working directory.
build_text_files(train, 'train_dataset_der.txt')
build_text_files(test, 'test_dataset_der.txt')

# Show the split sizes as the cell output.
len(train), len(test)

(26886, 4745)

## 