In [1]:
import pandas as pd

In [2]:
s = pd.Series([10,20,30],index=['a','b','c'])
print(s)

a    10
b    20
c    30
dtype: int64


In [3]:
s = pd.Series([1,2,3])
print(s + 10)
print(s * 2)

0    11
1    12
2    13
dtype: int64
0    2
1    4
2    6
dtype: int64


In [4]:
s = pd.Series([10,20,30,40])
mask = s > 15
print(s[mask])

1    20
2    30
3    40
dtype: int64


In [5]:
print(s[[2,0]])

2    30
0    10
dtype: int64


In [6]:
s = pd.Series([1,2,None,4])
s = s.fillna(s.mean())
print(s)

0    1.000000
1    2.000000
2    2.333333
3    4.000000
dtype: float64


In [7]:
import numpy as np
s = pd.Series([1,2,3])
print(s.apply(lambda x: x**2))
print(s.map({1:'a',2:'b',3:'c'}))

0    1
1    4
2    9
dtype: int64
0    a
1    b
2    c
dtype: object


In [8]:
s1 = pd.Series([1,2,3],index=['a','b','c'])
s2 = pd.Series([4,5,6],index=['b','c','d'])
print(s1+s2)

a    NaN
b    6.0
c    8.0
d    NaN
dtype: float64


In [9]:
index = pd.MultiIndex.from_tuples([('A',1),('A',2),('B',1)])
s = pd.Series([10,20,30],index=index)
print(s)

A  1    10
   2    20
B  1    30
dtype: int64


In [10]:
s = pd.Series([20,10,30])
print(s.nlargest(1))
print(s.nsmallest(1))

2    30
dtype: int64
1    10
dtype: int64


In [11]:
s = pd.Series([1,2,3,4,5])
print(s.rolling(window=3).mean())

0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
dtype: float64


In [12]:
s = pd.Series(['a','b','c'])
print(s.str.upper())

0    A
1    B
2    C
dtype: object


In [13]:
s = pd.Series(['2020-01-01','2020-02-01'])
s = pd.to_datetime(s)
print(s.dt.month)

0    1
1    2
dtype: int32


In [14]:
s = pd.Series([10,15,20])

def discount(x):
    return x * 0.09 if x > 12 else x


print(s.apply(discount))

0    10.00
1     1.35
2     1.80
dtype: float64


In [15]:
status = pd.Series(['new','expired','new','pending'])
status_map = {'new':0,'expired':1,'pending':2}
status_numeric = status.map(status_map)
print(status_numeric)

0    0
1    1
2    0
3    2
dtype: int64


In [16]:
s = pd.Series(['apple','banana','apple','orange'],dtype='category')
print(s)
print(s.cat.codes)

0     apple
1    banana
2     apple
3    orange
dtype: category
Categories (3, object): ['apple', 'banana', 'orange']
0    0
1    1
2    0
3    2
dtype: int8


In [17]:
tuples = [('A',2022),('A',2023),('B',2022),('B',2023)]

idx = pd.MultiIndex.from_tuples(tuples,names=['company','year'])
sales = pd.Series([500,600,450,700],index=idx)
print(sales)


print(sales['A'])

print(sales.loc[('A',2023)])

company  year
A        2022    500
         2023    600
B        2022    450
         2023    700
dtype: int64
year
2022    500
2023    600
dtype: int64
600


In [18]:
s = pd.Series([10,20,30,40],index=['a','a','b','b'])
grouped = s.groupby(level=0)
print(grouped.mean())

a    15.0
b    35.0
dtype: float64


In [19]:
print(grouped.transform(lambda x: x - x.mean()))

a   -5.0
a    5.0
b   -5.0
b    5.0
dtype: float64


In [20]:
print(grouped.filter(lambda x: x.mean() > 20))

b    30
b    40
dtype: int64


In [21]:
s = pd.Series(['red','blue','red','green'])
print(s.value_counts())
print(s.unique())
print(s.mode())

red      2
blue     1
green    1
Name: count, dtype: int64
['red' 'blue' 'green']
0    red
dtype: object


In [22]:
s = pd.Series([1,5,10,50])
print(s.clip(0,10))

ages = pd.Series([15,25,35,45,60])
bins = pd.cut(ages,bins=[0,18,34,60])

print(bins.value_counts())

0     1
1     5
2    10
3    10
dtype: int64
(34, 60]    3
(0, 18]     1
(18, 34]    1
Name: count, dtype: int64


In [23]:
s1 = pd.Series([1,2,3],index=['a','b','c'])
s2 = pd.Series([4,5,6],index=['b','c','d'])


print(s1.add(s2,fill_value=0))

a    1.0
b    6.0
c    8.0
d    6.0
dtype: float64


In [24]:
s = pd.Series([1,2,3,4,5])

print(s.rolling(window=3).mean())

print(s.expanding().mean())

0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
dtype: float64
0    1.0
1    1.5
2    2.0
3    2.5
4    3.0
dtype: float64


In [25]:
s = pd.Series([5,2,9,4])
print(s.agg(['min','max','median']))
print(s.agg({'minimum':'min','maximum':'max','sum':'sum'}))

min       2.0
max       9.0
median    4.5
dtype: float64
minimum     2
maximum     9
sum        20
dtype: int64


In [26]:
def add_two(x):
    return x + 2

def square(x):
    return x ** 2

s = pd.Series([1,2,3])

result = (s.pipe(add_two),s.pipe(square))
print(result)

(0    3
1    4
2    5
dtype: int64, 0    1
1    4
2    9
dtype: int64)


In [27]:
s = pd.Series(['category'] * 1000)
print(s.memory_usage(deep=True))

s = s.astype('category')
print(s.memory_usage(deep=True))

65132
1305


In [28]:
s = pd.Series([10,15,20])

def discount(x):
    return x * 0.9 if x > 12 else x

print(s.apply(discount))

status = pd.Series(['new','expired','new','pending'])
status_map = {'new':0,'expired':1,'pending':2}
status_numeric = status.map(status_map)
print(status_numeric)
print(status_map)

0    10.0
1    13.5
2    18.0
dtype: float64
0    0
1    1
2    0
3    2
dtype: int64
{'new': 0, 'expired': 1, 'pending': 2}


In [29]:
s = pd.Series(['apple','banana','apple','orange'],dtype='category')
print(s)
print(s.cat.codes)

0     apple
1    banana
2     apple
3    orange
dtype: category
Categories (3, object): ['apple', 'banana', 'orange']
0    0
1    1
2    0
3    2
dtype: int8


In [30]:
s = pd.Series([10,20,30,40],index=['a','a','b','b'])
grouped = s.groupby(level=0)
print(grouped.mean())

a    15.0
b    35.0
dtype: float64


In [31]:
s = pd.Series(['red','blue','red','green'])
print(s.value_counts())
print(s.unique())
print(s.mode())

red      2
blue     1
green    1
Name: count, dtype: int64
['red' 'blue' 'green']
0    red
dtype: object


In [32]:
s = pd.Series([1,5,10,50])
print(s.clip(0,10))


ages = pd.Series([15,25,35,45,60])
bins = pd.cut(ages,bins=[0,18,35,60])
print(bins.value_counts())

0     1
1     5
2    10
3    10
dtype: int64
(18, 35]    2
(35, 60]    2
(0, 18]     1
Name: count, dtype: int64


In [33]:
df = pd.DataFrame({
    'name':['Alice','Bob','Carol'],
    'score':[85,60,70]
})

mask = df['score'] > 65
print(df[mask])

    name  score
0  Alice     85
2  Carol     70


In [34]:
def adder(x,n):
    return x + n

s = pd.Series([1,2,3])
print(s.apply(adder,args=(6,)))

0    7
1    8
2    9
dtype: int64


In [35]:
sales = pd.Series([520,430,480,620,590],
                  index=pd.date_range('2024-07-01',periods=5))

In [36]:
print(sales)

2024-07-01    520
2024-07-02    430
2024-07-03    480
2024-07-04    620
2024-07-05    590
Freq: D, dtype: int64


In [37]:
print('Total sales:',sales.sum())
print('Average sales',sales.mean())
print('Day with highest sales:',sales.idxmax(),'=',sales.max())
print('Day with lowest sales:',sales.idxmin(),'=',sales.min())

Total sales: 2640
Average sales 528.0
Day with highest sales: 2024-07-04 00:00:00 = 620
Day with lowest sales: 2024-07-02 00:00:00 = 430


In [38]:
below_avg = sales[sales < sales.mean()]
print('Below-average sales days:\n',below_avg)

Below-average sales days:
 2024-07-01    520
2024-07-02    430
2024-07-03    480
Freq: D, dtype: int64


In [39]:
product_a = pd.Series([200, 190, 220, 260, 210],
                      index=sales.index)
product_b = pd.Series([320, 240, 260, 360, 380],
                      index=sales.index)

In [40]:
total_sales = product_a + product_b

In [41]:
print('Total sales by day:\n',total_sales)

Total sales by day:
 2024-07-01    520
2024-07-02    430
2024-07-03    480
2024-07-04    620
2024-07-05    590
Freq: D, dtype: int64


In [42]:
growth = total_sales.pct_change().fillna(0)

In [43]:
print('Percent change:\n',growth)

Percent change:
 2024-07-01    0.000000
2024-07-02   -0.173077
2024-07-03    0.116279
2024-07-04    0.291667
2024-07-05   -0.048387
Freq: D, dtype: float64


In [44]:
mov_avg = total_sales.rolling(window=3).mean()
print('3-day moving average:\n',mov_avg)

3-day moving average:
 2024-07-01           NaN
2024-07-02           NaN
2024-07-03    476.666667
2024-07-04    510.000000
2024-07-05    563.333333
Freq: D, dtype: float64


In [45]:
mean = total_sales.mean()
std = total_sales.std()
outlier = total_sales[total_sales > mean + 1.5 * std]
print('Sales outliers days:\n',outlier)

Sales outliers days:
 Series([], Freq: D, dtype: int64)


In [46]:
bins = [0,400,500,700]
labels = ['Low','Medium','High']
sales_levels = pd.cut(total_sales,bins=bins,labels=labels)
print('Sales level per day:\n',sales_levels)

Sales level per day:
 2024-07-01      High
2024-07-02    Medium
2024-07-03    Medium
2024-07-04      High
2024-07-05      High
Freq: D, dtype: category
Categories (3, object): ['Low' < 'Medium' < 'High']


In [47]:
df = pd.DataFrame({
    'product_a'
})

In [48]:


df = pd.DataFrame({
    'name':['Anna','Bob','Chirs','Pogi'],
    'age':[23,34,33,17]
})

In [49]:
df[df['age'] > 30]

Unnamed: 0,name,age
1,Bob,34
2,Chirs,33


In [50]:
df['age'] = 2024 - df['birth_year']

KeyError: 'birth_year'

In [None]:
df['is_adult'] = df['age'] >= 18


In [None]:
print(df)

    name  age  is_adult
0   Anna   23      True
1    Bob   34      True
2  Chirs   33      True
3   Pogi   17     False


In [None]:
tickets = [
    {'id': 1, 'priority': 'high', 'customer_tier': 'gold', 'category': 'billing', 'created_hour': 10, 'resolved': True, 'resolution_time_hours': 2},
    {'id': 2, 'priority': 'low', 'customer_tier': 'silver', 'category': 'technical', 'created_hour': 15, 'resolved': True, 'resolution_time_hours': 20},
    {'id': 3, 'priority': 'medium', 'customer_tier': 'gold', 'category': 'billing', 'created_hour': 8, 'resolved': True, 'resolution_time_hours': 5},
    {'id': 4, 'priority': 'high', 'customer_tier': 'bronze', 'category': 'account', 'created_hour': 22, 'resolved': False, 'resolution_time_hours': None},
    # ... more tickets
]

In [None]:
df = pd.DataFrame(tickets)
print(df.head())

   id priority customer_tier   category  created_hour  resolved  \
0   1     high          gold    billing            10      True   
1   2      low        silver  technical            15      True   
2   3   medium          gold    billing             8      True   
3   4     high        bronze    account            22     False   

   resolution_time_hours  
0                    2.0  
1                   20.0  
2                    5.0  
3                    NaN  


In [None]:
print(df['resolution_time_hours'].describe())

count     3.000000
mean      9.000000
std       9.643651
min       2.000000
25%       3.500000
50%       5.000000
75%      12.500000
max      20.000000
Name: resolution_time_hours, dtype: float64


In [None]:
print(df.groupby('priority')['resolution_time_hours'].mean())

priority
high       2.0
low       20.0
medium     5.0
Name: resolution_time_hours, dtype: float64


In [None]:
df['is_high_priority'] = df['priority'] == 'high'
df['created_at_night'] = df['created_hour'].apply(lambda x: x >= 20 or x < 6)

In [None]:
print(df)

   id priority customer_tier   category  created_hour  resolved  \
0   1     high          gold    billing            10      True   
1   2      low        silver  technical            15      True   
2   3   medium          gold    billing             8      True   
3   4     high        bronze    account            22     False   

   resolution_time_hours  is_high_priority  created_at_night  
0                    2.0              True             False  
1                   20.0             False             False  
2                    5.0             False             False  
3                    NaN              True              True  


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

train_df = df[df['resolved']]

X = pd.get_dummies(train_df[['priority','customer_tier','category','created_at_night']],drop_first=True)
y = train_df['resolution_time_hours']


X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
model = LinearRegression()
model.fit(X_train,y_train)
preds = model.predict(X_test)

In [None]:
input_features = pd.get_dummies(new_ticket_df[['priority', 'customer_tier', 'category', 'created_at_night']], drop_first=True)

# There may be missing columns compared to X_train (if, e.g., 'bronze' missing), so align:
input_features = input_features.reindex(columns=X_train.columns, fill_value=0)

# Step 4: Make the prediction
predicted_resolution_time = model.predict(input_features)[0]
print(f"Predicted resolution time: {predicted_resolution_time:.2f} hours")

{'priority': 'high', 'customer_tier': 'gold', 'category': 'billing', 'created_hour': 11}


In [51]:
data = [
    {'priority': 'high', 'customer_tier': 'gold', 'category': 'billing', 'created_hour': 10, 'resolution_time_hours': 2},
    {'priority': 'low', 'customer_tier': 'silver', 'category': 'technical', 'created_hour': 15, 'resolution_time_hours': 20},
    {'priority': 'medium', 'customer_tier': 'gold', 'category': 'billing', 'created_hour': 8, 'resolution_time_hours': 5},
    {'priority': 'high', 'customer_tier': 'bronze', 'category': 'account', 'created_hour': 22, 'resolution_time_hours': 7},
    # ...add more rows for realism
]

import pandas as pd

df = pd.DataFrame(data)

In [54]:
from sklearn.preprocessing import OneHotEncoder

categorical_features = ['priority','customer_tier','category']
oh = OneHotEncoder(sparse_output=False)
print(categorical_features )

['priority', 'customer_tier', 'category']


In [56]:
from sklearn.preprocessing import OneHotEncoder

categorical_features = ['priority', 'customer_tier', 'category']
oh = OneHotEncoder(sparse_output=False)

X_cat = oh.fit_transform(df[categorical_features])
cat_feature_names = oh.get_feature_names_out(categorical_features)
X_cat_df = pd.DataFrame(X_cat, columns=cat_feature_names)