In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from numpy.linalg import eig

# Ejercicio 1

## Cálculo de estadísticas

In [12]:
# Load the data: space-delimited text file without a header row.
gas_miles = pd.read_csv(
    'data/carc B.txt',
    header=None,
    sep=' ',
)
# Assign the two column names used throughout the exercise.
gas_miles.columns = ['lts_gas', 'miles']
# Preview the first rows.
gas_miles.head()

Unnamed: 0,lts_gas,miles
0,1,22
1,1,17
2,1,22
3,2,17
4,2,23


In [13]:
# Derive miles per liter as the element-wise ratio miles / lts_gas;
# the new column is added to gas_miles in place.
gas_miles['miles_per_liter'] = gas_miles['miles'] / gas_miles['lts_gas']
gas_miles.head()

Unnamed: 0,lts_gas,miles,miles_per_liter
0,1,22,22.0
1,1,17,17.0
2,1,22,22.0
3,2,17,8.5
4,2,23,11.5


In [14]:
# Dimensions of the frame as (rows, columns), including the derived column.
gas_miles.shape

(74, 3)

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) of miles per liter.
gas_miles.loc[:, 'miles_per_liter'].describe()

count    74.000000
mean     16.979730
std       6.178346
min       6.000000
25%      12.125000
50%      18.000000
75%      20.875000
max      34.000000
Name: miles_per_liter, dtype: float64

In [17]:
# Third and first quartiles of miles_per_liter, computed from the data
# instead of being copied by hand from the describe() output, so they
# cannot go stale if the input file changes.
F_U = float(gas_miles['miles_per_liter'].quantile(0.75))
F_L = float(gas_miles['miles_per_liter'].quantile(0.25))

# F-spread (interquartile range). Note: `df` is a scalar here, not a DataFrame.
df = F_U - F_L

# Outer bars (fences): 1.5 F-spreads beyond each quartile.
b_U = F_U + 1.5 * df
b_L = F_L - 1.5 * df

In [18]:
# Show the upper and lower outer bars computed in the previous cell.
print(b_U, b_L)

34.0 -1.0


In [20]:
# Largest observation that does not exceed the upper outer bar b_U.
gas_miles.loc[gas_miles['miles_per_liter'] <= b_U, 'miles_per_liter'].max()

34.0

In [21]:
# Smallest observation that is not below the lower outer bar b_L.
gas_miles.loc[gas_miles['miles_per_liter'] >= b_L, 'miles_per_liter'].min()

6.0

In [22]:
# Sample mean of miles per liter.
gas_miles.loc[:, 'miles_per_liter'].mean()

16.97972972972973

## Box plots

In [43]:
# Horizontal box plot of miles per liter: every observation is drawn next to
# the box (boxpoints='all') and the mean is marked (boxmean=True).
fig = go.Figure(
    data=[
        go.Box(
            x=gas_miles['miles_per_liter'],
            name=' ',
            boxmean=True,
            boxpoints='all',
            whiskerwidth=0.2,
            marker={
                'size': 6,
                'color': 'rgb(107,174,214)',
                'line': {'outliercolor': 'rgba(219, 64, 82, 0.6)'},
            },
        )
    ]
)

# Axis title, tight margins, fixed canvas size and a white template.
fig.update_layout(
    xaxis={
        'title': {
            'text': 'Millas por litro',
            'font': {'size': 18},
        },
    },
    margin={'l': 40, 'r': 10, 'b': 10, 't': 40},
    height=300,
    width=700,
    showlegend=False,
    template='plotly_white',
)

# Export the figure as a PDF before displaying it inline.
fig.write_image('images/miles_per_liter.pdf')

fig.show()

In [45]:
# Per-column count of rows whose miles per liter is strictly above 18.
gas_miles[gas_miles['miles_per_liter'] > 18].count()

lts_gas            31
miles              31
miles_per_liter    31
dtype: int64

# Ejercicio 2

In [2]:
# Load the bank-note measurements: tab-delimited file without a header row.
bills = pd.read_csv(
    'data/SwissBank 1.txt',
    header=None,
    sep='\t',
)
# Name the six measurement columns X_1 ... X_6.
bills.columns = ['X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6']
bills.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6
0,214.8,131.0,131.1,9.0,9.7,141.0
1,214.6,129.7,129.7,8.1,9.5,141.7
2,214.8,129.7,129.7,8.7,9.6,142.2
3,214.8,129.7,129.6,7.5,10.4,142.0
4,215.0,129.6,129.7,10.4,7.7,141.8


In [3]:
# Split by position: the first 100 rows versus all remaining rows.
# The variable names suggest the first 100 rows are the genuine notes —
# presumably the file is ordered that way; verify against the data source.
true_bills, false_bills = bills.iloc[:100], bills.iloc[100:]

In [8]:
# Box plots of X_1 for the two groups of bills, drawn on a shared axis.
fig = go.Figure()

# One trace per group, added in the order genuine -> counterfeit;
# only the data, the label and the colour differ between the two.
for group_label, group_frame, group_color in (
    ('Genuinos', true_bills, 'lightseagreen'),
    ('Falsificaciones', false_bills, 'indianred'),
):
    fig.add_traces(
        go.Box(
            x=group_frame['X_1'],
            name=group_label,
            boxmean=True,
            boxpoints='all',
            whiskerwidth=0.2,
            marker={
                'size': 3,
                'color': group_color,
                'line': {'outliercolor': 'rgba(219, 64, 82, 0.6)'},
            },
        )
    )

fig.update_layout(
    xaxis={
        'title': {
            'text': 'Largo del billete',
            'font': {'size': 18},
        },
    },
    margin={'l': 40, 'r': 10, 'b': 10, 't': 40},
    height=400,
    width=800,
    showlegend=False,
    template='plotly_white',
)

# PDF export is intentionally disabled for this figure.
# fig.write_image('images/bills.pdf')

fig.show()

In [9]:
# Summary statistics of X_6 for the first group of bills.
true_bills.loc[:, 'X_6'].describe()

count    100.000
mean     141.517
std        0.447
min      139.600
25%      141.200
50%      141.500
75%      141.800
max      142.400
Name: X_6, dtype: float64

In [16]:
# Summary statistics of X_6 for the second group of bills.
false_bills.loc[:, 'X_6'].describe()

count    100.000000
mean     139.450000
std        0.557864
min      137.800000
25%      139.200000
50%      139.500000
75%      139.800000
max      140.600000
Name: X_6, dtype: float64

In [10]:
# Third and first quartiles of X_6 for true_bills, computed from the data
# rather than hand-copied from the rounded describe() output.
F_U = float(true_bills['X_6'].quantile(0.75))
F_L = float(true_bills['X_6'].quantile(0.25))

# F-spread (interquartile range). Note: `df` is a scalar here, not a DataFrame.
df = F_U - F_L

# Outer bars (fences): 1.5 F-spreads beyond each quartile.
b_U = F_U + 1.5 * df
b_L = F_L - 1.5 * df

print(b_U)
print(b_L)

142.70000000000005
140.29999999999995


In [17]:
# Third and first quartiles of X_6 for false_bills, computed from the data
# rather than hand-copied from the rounded describe() output.
# These assignments overwrite F_U, F_L, df, b_U and b_L from the true_bills cell.
F_U = float(false_bills['X_6'].quantile(0.75))
F_L = float(false_bills['X_6'].quantile(0.25))

# F-spread (interquartile range). Note: `df` is a scalar here, not a DataFrame.
df = F_U - F_L

# Outer bars (fences): 1.5 F-spreads beyond each quartile.
b_U = F_U + 1.5 * df
b_L = F_L - 1.5 * df

print(b_U)
print(b_L)

140.70000000000005
138.29999999999995


In [13]:
# Largest X_6 among true_bills that does not exceed the upper outer bar (142.7).
true_bills.loc[true_bills['X_6'] <= 142.7, 'X_6'].max()

142.4

In [14]:
# Smallest X_6 among true_bills that is not below the lower outer bar (140.3).
true_bills.loc[true_bills['X_6'] >= 140.3, 'X_6'].min()

140.6

In [18]:
# Largest X_6 among false_bills that does not exceed the upper outer bar (140.7).
false_bills.loc[false_bills['X_6'] <= 140.7, 'X_6'].max()

140.6

In [20]:
# Smallest X_6 among false_bills that is not below the lower outer bar (138.3).
false_bills.loc[false_bills['X_6'] >= 138.3, 'X_6'].min()

138.3

In [5]:
# Box plots of X_6 for the two groups of bills, drawn on a shared axis.
fig = go.Figure()

# One trace per group, added in the order genuine -> counterfeit;
# only the data, the label and the colour differ between the two.
for group_label, group_frame, group_color in (
    ('Genuinos', true_bills, 'lightseagreen'),
    ('Falsificaciones', false_bills, 'indianred'),
):
    fig.add_traces(
        go.Box(
            x=group_frame['X_6'],
            name=group_label,
            boxmean=True,
            boxpoints='all',
            whiskerwidth=0.2,
            marker={
                'size': 3,
                'color': group_color,
                'line': {'outliercolor': 'rgba(219, 64, 82, 0.6)'},
            },
        )
    )

fig.update_layout(
    xaxis={
        'title': {
            'text': 'Diagonal del billete',
            'font': {'size': 18},
        },
    },
    margin={'l': 40, 'r': 10, 'b': 10, 't': 40},
    height=400,
    width=800,
    showlegend=False,
    template='plotly_white',
)

# Export the figure as a PDF before displaying it inline.
fig.write_image('images/bills_X6.pdf')

fig.show()

In [27]:
# Overlaid, semi-transparent histograms of X_6 for the two groups of bills.
fig = go.Figure()

# Same bin width (0.1) for both groups so the bars are directly comparable.
for group_label, group_frame in (
    ('Genuinos', true_bills),
    ('Falsificaciones', false_bills),
):
    fig.add_trace(
        go.Histogram(
            x=group_frame['X_6'],
            name=group_label,
            xbins={'size': 0.1},
        )
    )

# Half opacity keeps both distributions visible where they overlap.
fig.update_traces(opacity=0.5)

fig.update_layout(
    title={
        'text': 'Distribución de las medidas de<br>las diagonales de billetes reales y falsos',
        'font': {'size': 22},
    },
    xaxis={
        'title': {
            'text': 'Diagonal del billete',
            'font': {'size': 18},
        },
    },
    yaxis={
        'title': {
            'text': 'Cuenta',
            'font': {'size': 18},
        },
    },
    margin={'l': 40, 'r': 10, 'b': 10, 't': 40},
    height=500,
    width=800,
    barmode='overlay',
    template='plotly_white',
)

# PDF export is intentionally disabled for this figure.
# fig.write_image('images/hist_tf.pdf')

fig.show()

In [26]:
# Histogram of X_6 over ALL bills (both groups pooled together).
fig = go.Figure()

fig.add_trace(
    go.Histogram(
        x=bills['X_6'],
        # The trace covers every row of `bills`, so it must not be labelled
        # 'Genuinos' (that label was a copy-paste leftover from the
        # per-group histogram).
        name='Todos los billetes',
        xbins=dict(
            size=0.1
        )
    )
)

fig.update_traces(opacity=0.5)

fig.update_layout(
    title='Distribución de las medidas de<br>las diagonales de billetes',
    title_font_size=22,
    xaxis=dict(
        title='Diagonal del billete',
        title_font_size=18
    ),
    yaxis=dict(
        title='Cuenta',
        title_font_size=18
    ),
    margin=dict(
        l=40,
        r=10,
        b=10,
        t=40,
    ),
    height=500,
    width=800,
    barmode='overlay',
    showlegend=False,
    template='plotly_white',
)

# Export the figure as a PDF before displaying it inline.
fig.write_image('images/hist_bills.pdf')

fig.show()

# Ejercicio 4

In [4]:
# Center each column of the raw measurements at zero mean.
center_bills = bills - bills.mean()
# Sample covariance matrix; rowvar=False because observations are in rows.
cov_matrix = np.cov(center_bills, rowvar=False)
# A covariance matrix is symmetric, so use eigh: it guarantees real
# eigenvalues and a defined (ascending) order, whereas eig returns the
# eigenpairs in no particular order.
w, v = np.linalg.eigh(cov_matrix)
# Reorder to descending eigenvalue so that column 0 of `v` is the first
# principal direction, column 1 the second, and so on. The sign of each
# eigenvector is arbitrary.
w, v = w[::-1], v[:, ::-1]

In [29]:
# Preview the first rows of the original (unscaled) measurements.
bills.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6
0,214.8,131.0,131.1,9.0,9.7,141.0
1,214.6,129.7,129.7,8.1,9.5,141.7
2,214.8,129.7,129.7,8.7,9.6,142.2
3,214.8,129.7,129.6,7.5,10.4,142.0
4,215.0,129.6,129.7,10.4,7.7,141.8


In [30]:
# Work on a copy so that the original `bills` frame is left unmodified.
scaled = bills.copy()

In [31]:
# Divide four of the six measurements by 10; X_4 and X_5 keep their values.
# NOTE(review): presumably a unit change for those columns — confirm against
# the exercise statement.
rescaled_cols = ['X_1', 'X_2', 'X_3', 'X_6']
scaled[rescaled_cols] = bills[rescaled_cols].div(10)
scaled.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6
0,21.48,13.1,13.11,9.0,9.7,14.1
1,21.46,12.97,12.97,8.1,9.5,14.17
2,21.48,12.97,12.97,8.7,9.6,14.22
3,21.48,12.97,12.96,7.5,10.4,14.2
4,21.5,12.96,12.97,10.4,7.7,14.18


In [32]:
# Center each column of the rescaled measurements at zero mean.
center_bills = scaled - scaled.mean()
# Sample covariance matrix; rowvar=False because observations are in rows.
cov_matrix = np.cov(center_bills, rowvar=False)
# A covariance matrix is symmetric, so use eigh: it guarantees real
# eigenvalues and a defined (ascending) order, whereas eig returns the
# eigenpairs in no particular order.
w, v = np.linalg.eigh(cov_matrix)
# Reorder to descending eigenvalue so that column 0 of `v` is the first
# principal direction, column 1 the second, and so on; later cells plot
# columns 0 and 1 as PC1 and PC2. The sign of each eigenvector is arbitrary.
w, v = w[::-1], v[:, ::-1]

In [5]:
# Column means of the original (unscaled) measurements.
bills.mean()

X_1    214.8960
X_2    130.1215
X_3    129.9565
X_4      9.4175
X_5     10.6505
X_6    140.4835
dtype: float64

In [33]:
# Project the rescaled observations onto the eigenvector basis `v`; each
# column of the result holds the coordinates along one eigenvector.
# NOTE(review): `scaled` is not mean-centered (center_bills is), so these
# coordinates differ from centered scores by a constant shift per column —
# confirm this is intended.
example = scaled @ v

In [40]:
# Scatter plot of the first two columns of the projected data, one trace per
# group of rows.
fig = go.Figure()

# Rows 100 onward are added first, then rows 0-99, so the default trace
# colours are assigned in that order.
for row_block in (slice(100, None), slice(None, 100)):
    fig.add_trace(
        go.Scatter(
            x=example.iloc[row_block, 0],
            y=example.iloc[row_block, 1],
            mode='markers',
            line_width=1,
            marker_size=9,
        )
    )

fig.update_layout(
    title={
        'text': 'First vs Second PC',
        'font': {'size': 22},
    },
    xaxis={
        'title': {
            'text': 'PC1',
            'font': {'size': 18},
        },
    },
    yaxis={
        'title': {
            'text': 'PC2',
            'font': {'size': 18},
        },
    },
    margin={'l': 40, 'r': 10, 'b': 10, 't': 40},
    height=600,
    width=800,
    showlegend=False,
    template='plotly_white',
)

fig.show()