# Reddit Spanish Sample

Before running this notebook, run the Python script that generates the gzip-compressed parquet files (readable with pandas) containing Spanish comments from Reddit.

In [1]:
import os 

import numpy as np
import pandas as pd

from glob import glob

## Build a single sample file with up to 10k comments from each subreddit

In [3]:
# Directory that holds the input parquet files (see the note at the top of the notebook).
input_directory = '/shared/3/projects/hiatus/multilingual/reddit-spanish/'
# The combined sample is written into the same directory and shares the '.gzip' suffix.
out_file = os.path.join(input_directory, 'spanish_subreddit_samples.parquet.gzip')
# Exclude out_file: otherwise a re-run would read a previously written sample back in
# as if it were raw input. Sort because glob returns paths in arbitrary order, and the
# concatenation order determines which rows the seeded sampling picks.
parquet_files = sorted(
    fp for fp in glob(os.path.join(input_directory, '*.gzip'))
    if os.path.abspath(fp) != os.path.abspath(out_file)
)
len(parquet_files)

11

In [4]:
# Load each input parquet file into its own DataFrame, then stack them row-wise.
frames = [pd.read_parquet(path) for path in parquet_files]

df = pd.concat(frames)
df.head(3)

Unnamed: 0,file,author,subreddit,created_utc,link_id,parent_id,text
0,/shared/2/datasets/reddit-dump-all/RC/RC_2023-...,[deleted],chile,1672531000.0,t3_1001o85,t1_j2fxalg,[removed]
1,/shared/2/datasets/reddit-dump-all/RC/RC_2023-...,Tierrrez,chile,1672531000.0,t3_1006r7b,t3_1006r7b,no se si la calidad del video te permite ver b...
2,/shared/2/datasets/reddit-dump-all/RC/RC_2023-...,conrick,mexico,1672531000.0,t3_1002np5,t3_1002np5,Le salio mal el capricho.


In [5]:
# Total number of rows across all loaded files.
df.shape[0]

3628240

In [6]:
# Keep only the columns that go into the sample file.
keep_columns = ['author', 'subreddit', 'text']
df = df[keep_columns]

In [7]:
def _cap_sample(group):
    """Return a seeded random sample of at most 10,000 rows from one subreddit's rows."""
    return group.sample(n=min(len(group), 10000), random_state=42)


# Sample each subreddit independently; subreddits with fewer than 10,000 rows are kept whole.
# NOTE(review): nothing here filters rows whose text is '[removed]' (one is visible in the
# df.head(3) output), so they can end up in the sample. Confirm that is intended.
per_subreddit = df.groupby('subreddit').apply(_cap_sample)
# Drop the (subreddit, original index) labels produced by apply and renumber from 0.
sampled_df = per_subreddit.reset_index(drop=True)
len(sampled_df)

320824

In [8]:
# Persist the combined sample as a gzip-compressed parquet file.
sampled_df.to_parquet(path=out_file, compression='gzip')

In [9]:
# Sanity check: number of sampled rows per subreddit (none should exceed 10,000).
subreddit_counts = sampled_df['subreddit'].value_counts()
subreddit_counts.to_dict()

{'ArgEntos': 10000,
 'Panama': 10000,
 'PuertoRico': 10000,
 'SpainPolitics': 10000,
 'SquarePosting': 10000,
 'argentina': 10000,
 'chile': 10000,
 'dankgentina': 10000,
 'ecuador': 10000,
 'es': 10000,
 'espanol': 10000,
 'memexico': 10000,
 'mexico': 10000,
 'preguntaleareddit': 10000,
 'spain': 10000,
 'uruguay': 10000,
 'vzla': 10000,
 'Paraguay': 10000,
 'yo_elvr': 10000,
 'PERU': 10000,
 'LigaMX': 10000,
 'Colombia': 10000,
 'Mujico': 10000,
 'Asi_va_Espana': 10000,
 'BOLIVIA': 10000,
 'MAAU': 10000,
 'Ticos': 9594,
 'WriteStreakES': 8276,
 'Dominican': 8015,
 'Spanishhelp': 7021,
 'Desahogo': 4395,
 'RedditPregunta': 3948,
 'programacion': 3345,
 'fulbo': 2456,
 'LaLiga': 2297,
 'VideojuegosMX': 1763,
 'futbol': 1645,
 'latinoamerica': 1360,
 'yo_ctm': 1350,
 'videojuego': 774,
 'libros': 706,
 'redditores': 613,
 'HistoriasdeTerror': 590,
 'HistoriasDeReddit': 510,
 'filosofia_en_espanol': 505,
 'fisica': 434,
 'latinos': 387,
 'cuentaleareddit': 195,
 'Cinefilos': 142,
 'futb