In [1]:
import pandas as pd

In [2]:
# Load the top-contributors dataset from the relative results/ directory.
# The file still carries an 'Unnamed: 0' column, which the following cell drops.
csv_path = 'results/top_contribuyentes.csv'
df = pd.read_csv(csv_path)

In [3]:
# Step 0: discard the leftover 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps this cell idempotent: because `df` is reassigned,
# re-running the cell would otherwise raise KeyError once the column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

#### More cleaning

In [4]:
# Summarise column dtypes and non-null counts to see which columns still need
# cleaning (missing values, identifiers or years stored as floats).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   RUC                    4000 non-null   float64
 1   NOMBRE O RAZON SOCIAL  4500 non-null   object 
 2   APORTE TOTAL(mm)       4500 non-null   int64  
 3   ANHO                   4500 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 140.8+ KB


In [5]:
# RUC: missing values become 0, then float -> int -> str, so a missing RUC ends
# up as the placeholder string '0' and the rest lose the trailing '.0'.
ruc_as_text = df['RUC'].fillna(0).astype(int).astype(str)
# ANHO: stored as float in the CSV; cast it to an integer year.
anho_as_int = df['ANHO'].astype(int)
df = df.assign(RUC=ruc_as_text, ANHO=anho_as_int)

In [6]:
# Sort newest year first and, within each year, largest contribution first.
ranking_keys = ['ANHO', 'APORTE TOTAL(mm)']
df = df.sort_values(by=ranking_keys, ascending=[False, False])
# With the rows already ordered, the running count inside each year (shifted to
# start at 1) is the contributor's position for that year.
df['POSICION'] = df.groupby('ANHO').cumcount().add(1)
df.head(20)

Unnamed: 0,RUC,NOMBRE O RAZON SOCIAL,APORTE TOTAL(mm),ANHO,POSICION
0,80009735,ADMINISTRACION NACIO,336481,2021,1
1,80008790,TABACALERA DEL ESTE,198331,2021,2
2,80002201,BANCO ITAU PARAGUAY,174807,2021,3
3,80003400,PARAGUAY REFRESCOS S,147677,2021,4
4,80086846,CERVEPAR S.A.,146989,2021,5
5,80019270,BANCO CONTINENTAL SA,142276,2021,6
6,80000856,BANCO NACIONAL DE FO,139376,2021,7
7,80009310,VISION BANCO S.A.E.C,93627,2021,8
8,80034461,SUDAMERIS BANK SAECA,92594,2021,9
9,80003457,CERVECERIA PARAGUAYA,85477,2021,10


#### Ya que está ordenado y con su posición, solamente buscamos los que quedaron en el top 10

In [7]:
# Keep only the rows whose per-year position is within the first ten.
in_top10 = df['POSICION'].le(10)
df_top10 = df.loc[in_top10]
df_top10.head(25)

Unnamed: 0,RUC,NOMBRE O RAZON SOCIAL,APORTE TOTAL(mm),ANHO,POSICION
0,80009735,ADMINISTRACION NACIO,336481,2021,1
1,80008790,TABACALERA DEL ESTE,198331,2021,2
2,80002201,BANCO ITAU PARAGUAY,174807,2021,3
3,80003400,PARAGUAY REFRESCOS S,147677,2021,4
4,80086846,CERVEPAR S.A.,146989,2021,5
5,80019270,BANCO CONTINENTAL SA,142276,2021,6
6,80000856,BANCO NACIONAL DE FO,139376,2021,7
7,80009310,VISION BANCO S.A.E.C,93627,2021,8
8,80034461,SUDAMERIS BANK SAECA,92594,2021,9
9,80003457,CERVECERIA PARAGUAYA,85477,2021,10


In [8]:
# Inspect the other end of the top-10 frame; df was sorted by ANHO descending,
# so the last rows belong to the earliest year.
df_top10.tail(20)

Unnamed: 0,RUC,NOMBRE O RAZON SOCIAL,APORTE TOTAL(mm),ANHO,POSICION
3500,80003457,CERVECERIA PARAGUAYA,286676,2014,1
3501,80009735,ADMINISTRACION NACIO,229278,2014,2
3502,80008790,TABACALERA DEL ESTE,225700,2014,3
3503,80000519,TELEF. CELULAR DEL P,223909,2014,4
3504,80002201,BANCO ITAU PARAGUAY,127110,2014,5
3505,80019270,BANCO CONTINENTAL SA,109687,2014,6
3506,80003400,PARAGUAY REFRESCOS S,106160,2014,7
3507,80021825,BRASFUMO DEL PARAGUA,86768,2014,8
3508,80016096,RETAIL S.A.,83937,2014,9
3509,80007801,AGRO SILO SANTA CATA,82765,2014,10


# Encontrar cuántos son Personas Físicas y cuántos son Personas Jurídicas

### Buscar en base al Nro de RUC

In [9]:
# Rows whose RUC string begins with '8' (presumably the legal entities,
# "personas juridicas" - confirm the RUC numbering rule).
# NOTE(review): despite the variable name, this filters the full `df`, not
# `df_top10`, and it does not deduplicate RUC values - confirm which was intended.
ruc_starts_with_8 = df['RUC'].str.startswith('8')
df_top10_ruc = df.loc[ruc_starts_with_8]
df_top10_ruc.head()

Unnamed: 0,RUC,NOMBRE O RAZON SOCIAL,APORTE TOTAL(mm),ANHO,POSICION
0,80009735,ADMINISTRACION NACIO,336481,2021,1
1,80008790,TABACALERA DEL ESTE,198331,2021,2
2,80002201,BANCO ITAU PARAGUAY,174807,2021,3
3,80003400,PARAGUAY REFRESCOS S,147677,2021,4
4,80086846,CERVEPAR S.A.,146989,2021,5


In [10]:
# Complement of the previous selection: rows whose RUC string does not begin
# with '8'. Rows whose missing RUC was replaced by the placeholder '0' also land here.
# NOTE(review): like the previous cell, this filters the full `df`, not `df_top10`.
ruc_starts_with_8 = df['RUC'].str.startswith('8')
df_top10_no_ruc = df.loc[~ruc_starts_with_8]

In [11]:
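# TODO: create a column holding 'F' or 'J' depending on the RUC (the original note was left unfinished; presumably 'J' when the RUC starts with '8' - confirm)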

## Con Excel, rellenar los datos faltantes de RUC