In [2]:
import io
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src import timed

In [14]:
buffer = io.StringIO('''
product,quantity,price
apple,10,0.5
banana,5,0.3
apple,4,0.5
banana,7,0.3
orange,8,0.7
'''.lstrip())

In [3]:
buffer

<_io.StringIO at 0x7ba90412f1c0>

In [None]:
# brodcasting

#regular python
x = [1, 2, 3, 4, 5]
x * 3

# numpay array

arr = np.array([1,2,3,4,5])
arr * 3
np.array([1,2,3,4,5]) * np.array([1,2,3,4,5])

#### What product has the highest revenue(quantity * price)?

In [19]:
from csv import DictReader, reader
from collections import defaultdict, Counter

product_revenues = defaultdict(int)

buffer = io.StringIO('''
product,quantity,price
apple,10,0.5
banana,5,0.3
apple,4,0.5
banana,7,0.3
orange,8,0.7
'''.lstrip())
product_count = Counter()

In [20]:
for row in DictReader(buffer):
    product_revenues[row["product"]] += (float(row["quantity"]) * float(row["price"]))
    product_count[row["product"]] += 1

In [18]:
Counter(product_revenues).most_common(1)


[('apple', 7.0)]

#### How often does each product appear?

In [24]:
product_count

Counter({'apple': 2, 'banana': 2, 'orange': 1})

In [48]:
buffer = io.StringIO('''
product,quantity,price
apple,10,0.5
banana,5,0.3
apple,4,0.5
banana,7,0.3
orange,8,0.7
'''.lstrip())

df = pd.read_csv(buffer)

# df["quantity"] * df["price"]
# df.groupby("product")["quantity"].sum()
df["revenue"] = df["quantity"] * df["price"]
product_revenues_ = df.groupby("product")["revenue"].sum()
# idx przy grupby są nazwami grup
# type(product_revenues_) # Series
print(f'Product: {product_revenues_.idxmax()}, {product_revenues_[product_revenues_.idxmax()]}')
print(f"Frequency: {df.groupby(['product'])["product"].count()}")

Product: apple, 7.0
Frequency: product
apple     2
banana    2
orange    1
Name: product, dtype: int64


In [37]:
df

Unnamed: 0,product,quantity,price,revenue
0,apple,10,0.5,5.0
1,banana,5,0.3,1.5
2,apple,4,0.5,2.0
3,banana,7,0.3,2.1
4,orange,8,0.7,5.6



##Data Types

In [9]:
from sys import getsizeof

class T:
    pass

xs = [1 for _ in range(10_000)]
x = [T() for _ in range(10_000)]

print(sum(getsizeof(x_item) for x_item in x) /1024 /1024, sum(getsizeof(x_item) for x_item in xs) /1024 /1024)
print(pd.Series(xs).memory_usage(deep=True) / 1024 / 1024, sep="\n")

0.457763671875 0.26702880859375
0.07641983032226562


In [10]:
from numpy.random import default_rng

rng = default_rng(42)
np_xs = rng.normal(0, 10, size=10_000)
py_xs = np_xs.tolist()

with timed("python list.append"):
    for _ in range(10_000):
        py_xs.append(1)

with timed("python numpy.concatenate"):
    add_elements = []
    for _ in range(10_000):
        add_elements.append(1)

    np_xs = np.concatenate((np_xs, add_elements))

python list.append took 0.003888 seconds
python numpy.concatenate took 0.005261 seconds


In [16]:
df = pd.DataFrame({
    "bool": [True, False, True],
    "int": [1, 2, 3],
    "float": [1.1, 2.2, 3.3],
    'string': ['a', 'b', 'c'],
    "datetime": pd.date_range("2021-01-01", periods=3, freq="D"),
    "timedelta": pd.timedelta_range('1 day', periods=3),
    'categorical': pd.Categorical(['a', 'b', 'c']),
})
df


Unnamed: 0,bool,int,float,string,datetime,timedelta,categorical
0,True,1,1.1,a,2021-01-01,1 days,a
1,False,2,2.2,b,2021-01-02,2 days,b
2,True,3,3.3,c,2021-01-03,3 days,c


In [12]:
df.dtypes


bool                      bool
int                      int64
float                  float64
string                  object
datetime        datetime64[ns]
timedelta      timedelta64[ns]
categorical           category
object                  object
dtype: object

In [3]:
x = pd.Series([1, 2, 3, 4, 5, 'yolo'], dtype='string[pyarrow]')


In [4]:
x.dtypes

string[pyarrow]

In [31]:
df.to_csv("data.csv") # klasyczna metod zrzucania na dysk
df.to_parquet("file.parquet") # zrzucanie do pliku parquet, bardziej koszerne
os.path.getsize("data.csv"), os.path.getsize("file.parquet")


(160, 4867)

In [40]:
s1 = pd.Series([ 1,  2,  3,  4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=[     'b',      'd', 'e', 'f'])
s1


a    1
b    2
c    3
d    4
dtype: int64

In [44]:
s1.reindex(["a", "b", "c", "d", "e", "f"])

a    1.0
b    2.0
c    3.0
d    4.0
e    NaN
f    NaN
dtype: float64

In [48]:
s2.reindex(set(list(s1.index) + list(s2.index))).fillna(0)


e    30.0
f    40.0
c     0.0
d    20.0
b    10.0
a     0.0
dtype: float64

In [35]:
s1 + s2

a     NaN
b    12.0
c     NaN
d    24.0
e     NaN
f     NaN
dtype: float64

In [38]:
s1.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [None]:
s1.

In [39]:
s2.index

Index(['b', 'd', 'e', 'f'], dtype='object')

#### Wczytywanie danych

In [50]:
s1.add(s2, fill_value=0) #jeażeli sumujemy dwie serie, które nie mają wspólnych indexów, to w wyniku dostaniemy NaN, ale można dodać domyślną wartość 0

a     1.0
b    12.0
c     3.0
d    24.0
e    30.0
f    40.0
dtype: float64

In [72]:
buffer = io.StringIO('''
customer_id,date,amount,country
001,2024/01/01,$100.50,US
002,2024-01-05,$50.25,UK
,2024-01-08,,$50.75
004,2024-01-11,$75,Canada
005,,,$0.00
006,2024-01-14,$30.10,UK
007,2024-01-16,$45.00,US
008,2024-01-20,$60.75,
''')


df = (
    pd.read_csv(buffer)
    .assign(
        date=lambda d: pd.to_datetime(d['date'], format='mixed').dt.day_name(),
        amount=lambda d: pd.to_numeric(d["amount"].fillna(d["country"]).str.lstrip("$")),
        country=lambda d: d["country"].astype("string[pyarrow]").mask(lambda s: s.str.startswith("$")),
)
)

df


Unnamed: 0,customer_id,date,amount,country
0,1.0,Monday,100.5,US
1,2.0,Friday,50.25,UK
2,,Monday,50.75,
3,4.0,Thursday,75.0,Canada
4,5.0,,0.0,
5,6.0,Sunday,30.1,UK
6,7.0,Tuesday,45.0,US
7,8.0,Saturday,60.75,


In [66]:
df.dtypes

customer_id    float64
date            object
amount         float64
country         object
dtype: object

In [95]:
from pandas import read_csv, DataFrame
from io import StringIO
from textwrap import dedent

buffer = StringIO(dedent('''
    device,upgrade_dates
    device-1,2000-01-01,2000-02-01,2000-03-01
    device-2,2000-01-01,2000-04-01
    device-3,2000-01-01,2000-03-01,2000-05-01,2000-10-01
    device-4,2000-01-01,2000-07-01,2000-09-01
''').strip())

# /df = pd.read_csv(buffer, on_bad_lines="skip")

def process(f):
    f = (ln.strip().split(",") for ln in f)
    yield next(f)
    for line in f:
        dev, *dates = line
        yield dev, dates

headers , *data = process(buffer)

# headers = []
# data = []
# for index,ln in enumerate(buffer):
#     row = ln.strip().split(",")
#     if index == 0:
#         headers.extend(row) # pierwsza linia to nagłówki
#     else:
#         device, *dates = row
#         data.append([device, dates])
#
# print(headers, data)

df = pd.DataFrame(data, columns=headers).explode("upgrade_dates")# explode rozbija listę na pojedyncze wartości

df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,device,upgrade_dates
0,device-1,2000-01-01
1,device-1,2000-02-01
2,device-1,2000-03-01
3,device-2,2000-01-01
4,device-2,2000-04-01
5,device-3,2000-01-01
6,device-3,2000-03-01
7,device-3,2000-05-01
8,device-3,2000-10-01
9,device-4,2000-01-01
