In [28]:
import numpy as np
import pandas as pd
import datetime as dt

import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns

from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

from scipy import stats
import math
from scipy.stats import levene
import researchpy as rp
from sklearn.preprocessing import StandardScaler
import itertools

import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Default size for every figure drawn by matplotlib below.
plt.rcParams.update({"figure.figsize": (10, 8)})

In [29]:
# Popularity table: one row per villager with its tier, rank and name.
popul_df = pd.read_csv(filepath_or_buffer='csv/acnh_villager_data.csv')
popul_df.head(n=5)

Unnamed: 0,tier,rank,name
0,1,1,Raymond
1,1,2,Marshal
2,1,3,Shino
3,1,4,Sherb
4,1,5,Sasha


### Créditos a EMPYRR en Kaggle por la depuración:

[Fuente](https://www.kaggle.com/code/ampiiere/animal-crossing-villager-popularity-analysis)



In [30]:
# Villager attribute table (species, personality, hobby, ...), keyed by Name.
vecines = pd.read_csv(filepath_or_buffer='csv/vecines_tipo.csv')
vecines.head(n=5)

Unnamed: 0,ID,Name,Species,Gender,Personality,Subtype,Hobby,Birthday,Style 1,Style 2,Color 1,Color 2,Type
0,0,Admiral,Bird,Male,Cranky,A,Nature,1-27,Cool,Cool,Black,Blue,Bird
1,1,Agent S,Squirrel,Female,Peppy,B,Fitness,7-2,Active,Simple,Blue,Black,Mammal
2,2,Agnes,Pig,Female,Big Sister,A,Play,4-21,Simple,Elegant,Pink,White,Mammal
3,3,Al,Gorilla,Male,Lazy,B,Fitness,10-18,Active,Active,Red,White,Mammal
4,4,Alfonso,Alligator,Male,Lazy,B,Play,6-9,Simple,Simple,Red,Blue,Amphibian


In [31]:
# 27 names in popul_df have no counterpart in vecines["Name"]
unmatched_mask = ~popul_df["name"].isin(vecines["Name"])
no_match_names = popul_df.loc[unmatched_mask, "name"]
len(no_match_names)

27

In [32]:
# popul_df holds 413 distinct names before any of them are removed
# (dropna=False so a missing value would be counted, exactly like len(unique()))
popul_df['name'].nunique(dropna=False)

413

In [33]:
# The data set is small enough to pick out the mismatching spellings by hand.
# Map each spelling used in popul_df to the spelling used in vecines["Name"].
name_corrections = {
    'OHare': "O'Hare",
    'Buck(Brows)': "Buck",
    'Renee': "Renée",
    'WartJr': "Wart Jr.",
    'Crackle(Spork)': "Spork",
}
popul_df['name'] = popul_df['name'].replace(name_corrections)

In [34]:
# Check how long the list of unmatched names is after the spelling corrections
still_unmatched = ~popul_df["name"].isin(vecines["Name"])
no_match_names = popul_df.loc[still_unmatched, "name"]
len(no_match_names)

22

In [35]:
# Replace every name listed in no_match_names with NaN; the rows themselves
# stay in popul_df for now, only their 'name' value becomes missing.
popul_df['name'] = popul_df['name'].mask(popul_df['name'].isin(no_match_names))

In [36]:
# Element-by-element (and therefore order-sensitive) comparison of both name columns
popul_df['name'].tolist() == vecines['Name'].tolist()

False

In [37]:
# Values that appear in exactly one of the two name columns
list(set(popul_df['name'].to_list()) ^ set(vecines['Name'].to_list()))

[nan]

In [38]:
# Missing values per column
popul_df.isna().sum()

tier     0
rank     0
name    22
dtype: int64

Tras revisarlo, vemos que la única diferencia entre ambas columnas son los 22 valores nulos de `popul_df['name']`, así que vamos a borrar esas filas:

In [39]:
# Remove every row that has a missing value in any column
popul_df = popul_df.dropna()

In [40]:
# Confirm that no missing values are left in any column
popul_df.isna().sum()

tier    0
rank    0
name    0
dtype: int64

In [41]:
# This evaluates to False, but both columns have the same length and
# symmetric_difference finds no differences, so we leave it as it is :-)
# (list equality is order-sensitive -- presumably the two columns simply
# list the same names in a different order)
popul_df['name'].tolist() == vecines['Name'].tolist()

False

In [42]:
# Both name columns must contain the same number of entries
popul_df['name'].size == vecines['Name'].size

True

In [43]:
# Values that appear in exactly one of the two name columns (expected: none)
list(set(popul_df['name'].to_list()) ^ set(vecines['Name'].to_list()))

[]

In [44]:
# Save the cleaned popularity table to a CSV, just in case
popul_df.to_csv(path_or_buf='csv/popularidad.csv')

¡Mergeamos!

In [45]:
# Keep only the villagers of popul_df whose name also appears in vecines
present_in_vecines = popul_df["name"].isin(vecines["Name"])
popul_df = popul_df[present_in_vecines]

In [46]:
# Both frames now have the same length, so index each one by villager name
# (the name column is moved into the index) ahead of combining them.
popul_df = popul_df.set_index('name')
vecines = vecines.set_index('Name')

In [47]:
# Inner join on the index of both frames: vecines columns first, then popul_df's
vecines_popus = vecines.merge(
    popul_df,
    how='inner',
    left_index=True,
    right_index=True,
)

In [48]:
# Display the column labels of the merged frame
vecines_popus.columns

Index(['ID', 'Species', 'Gender', 'Personality', 'Subtype', 'Hobby',
       'Birthday', 'Style 1', 'Style 2', 'Color 1', 'Color 2', 'Type', 'tier',
       'rank'],
      dtype='object')

In [49]:
# Author's note: `rank` is a ranking within each tier, but an overall ranking
# is also of interest. Order the rows by tier and then rank, and number them
# 1..N in that order.
vecines_popus = (
    vecines_popus
    .sort_values(by=['tier', 'rank'])
    .assign(rank_overall=lambda ranked: np.arange(1, len(ranked) + 1))
)

In [50]:
# Parse the 'month-day' strings into datetimes. The input carries no year, so
# the parsed dates fall in 1900 (a placeholder, not a real birth year).
# NOTE(review): 1900 is not a leap year, so a '2-29' birthday would presumably
# fail to parse -- confirm if such a value can ever appear.
vecines_popus = vecines_popus.assign(
    Birthday=pd.to_datetime(vecines_popus['Birthday'], format='%m-%d')
)

In [51]:
# Preview the first rows of the merged frame
vecines_popus.head()

Unnamed: 0,ID,Species,Gender,Personality,Subtype,Hobby,Birthday,Style 1,Style 2,Color 1,Color 2,Type,tier,rank,rank_overall
Raymond,298,Cat,Male,Smug,B,Nature,1900-10-01,Elegant,Cool,Black,Gray,Mammal,1,1,1
Marshal,228,Squirrel,Male,Smug,A,Music,1900-09-29,Elegant,Cool,Light blue,Blue,Mammal,1,2,2
Sherb,329,Goat,Male,Lazy,A,Nature,1900-01-18,Simple,Cute,Gray,Blue,Mammal,1,4,3
Ankha,12,Cat,Female,Snooty,B,Nature,1900-09-22,Gorgeous,Simple,Colorful,Brown,Mammal,1,7,4
Zucker,390,Octopus,Male,Lazy,A,Nature,1900-03-08,Simple,Cute,Blue,Yellow,Amphibian,1,8,5


In [52]:
# Persist the merged villager + popularity table
vecines_popus.to_csv(path_or_buf='csv/vecines_popus_tipos.csv')