In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request 
from PIL import Image 
from IPython.display import Image
from IPython.core.display import HTML 

## Series

In [18]:
# Small labelled Series: the mapping keys become the index labels, and the
# Series itself carries the name 'example'.
example = pd.Series(
    {'a': 1, 'b': 2, 'c': 3},
    name='example',
)
example

a    1
b    2
c    3
Name: example, dtype: int64

In [19]:
# Tour of the basic Series accessors.
# NOTE: this cell rebinds `example` (rename, then reset_index), so it is meant
# to be run once, directly after the cell that creates the Series.
print(example.values)          # underlying NumPy array
print('-' * 30)
print(example.index)           # row labels
print('-' * 30)
print(example.value_counts())  # how many times each distinct value occurs
print('-' * 30)
# Write through the Series indexer instead of through `.values`: under pandas
# Copy-on-Write the array returned by `.values` can be read-only, and item
# assignment on it then raises ValueError.
example.iloc[0] = 100
example = example.rename({'c': 'd'})  # relabel row 'c' as 'd'
print(example)
print('-' * 30)
print(example.iloc[0])  # access by row position
print('-' * 30)
print(example)
print(example.loc['b'])  # access by row label
# drop=False (the default) keeps the old labels as an 'index' column, so the
# result is a DataFrame rather than a Series.
example = example.reset_index(drop=False)
print('-' * 30)
example

[1 2 3]
------------------------------
Index(['a', 'b', 'c'], dtype='object')
------------------------------
example
1    1
2    1
3    1
Name: count, dtype: int64
------------------------------
a    100
b      2
d      3
Name: example, dtype: int64
------------------------------
100
------------------------------
a    100
b      2
d      3
Name: example, dtype: int64
2
------------------------------


Unnamed: 0,index,example
0,a,100
1,b,2
2,d,3


## DataFrame

In [20]:
# Address of a diagram illustrating the DataFrame structure.
url = 'https://pythonru.com/wp-content/uploads/2020/05/struktura-dataframe.png'
# `Image` resolves to IPython.display.Image here: the import cell imports it
# after PIL's Image, so the later binding wins. The picture is referenced by URL.
display(Image(url=url))

In [21]:
# `example` was already turned into a DataFrame by reset_index() in the Series
# section, so this wraps an existing frame rather than a Series.
df = pd.DataFrame(example)
df

Unnamed: 0,index,example
0,a,100
1,b,2
2,d,3


In [112]:
# Column-oriented input: every key becomes a column and its list holds that
# column's values. The mapping is named `columns_data` rather than `dict` so
# that the built-in `dict` type is not shadowed for the rest of the notebook.
columns_data = {
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
}
df = pd.DataFrame(columns_data, index=[100, 200, 300])
# Order rows by 'b' descending; ties are broken by 'a', also descending.
df = df.sort_values(by=['b', 'a'], ascending=[False, False])
df

Unnamed: 0,a,b,c
300,3,6,9
200,2,5,8
100,1,4,7


In [23]:
# The source mapping has three fields; `columns=` selects (and orders) only
# 'object' and 'price' for the resulting frame, so 'color' is left out.
data = {
    'color': ['black', 'red', 'white'],
    'object': ['pencil', 'paper', 'mug'],
    'price': [1.2, 0.9, 1.7],
}
frame2 = pd.DataFrame(data=data, columns=['object', 'price'])
frame2

Unnamed: 0,object,price
0,pencil,1.2
1,paper,0.9
2,mug,1.7


In [24]:
# Rows labelled 0 through 1 (label slices include both ends) of the 'object' column.
frame2.loc[0:1, 'object']

0    pencil
1     paper
Name: object, dtype: object

In [25]:
# Label both axes, then add two columns: one derived from 'price' and one
# filled with a constant.
frame2.index.name, frame2.columns.name = 'id', 'item'
frame2['3price'] = frame2['price'] * 3
frame2['new'] = True
frame2

item,object,price,3price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,pencil,1.2,3.6,True
1,paper,0.9,2.7,True
2,mug,1.7,5.1,True


In [26]:
# Remove the constant 'new' column again.
frame2 = frame2.drop(columns='new')
frame2

item,object,price,3price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,pencil,1.2,3.6
1,paper,0.9,2.7
2,mug,1.7,5.1


## Фильтрация

In [28]:
# Boolean-mask filter: keep only the rows whose price is below 1.
frame2.loc[frame2['price'] < 1]

item,object,price,3price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,paper,0.9,2.7


In [29]:
# Convert the frame to a NumPy array; with text and float columns mixed the
# array dtype is object.
a = frame2.to_numpy()
a

array([['pencil', 1.2, 3.5999999999999996],
       ['paper', 0.9, 2.7],
       ['mug', 1.7, 5.1]], dtype=object)

In [31]:
frame2.columns.name = 'ITEMS'
frame2.index.name = 'ID'
# rename() returns a new frame and leaves frame2 itself untouched. A cell only
# renders its last expression, so the row-label rename is shown explicitly with
# display(); previously its result was silently discarded.
# Labels that are absent from the index (100 here) are ignored by default.
display(frame2.rename(index={0: 'zero', 1: 'one', 100: 'there is no 100'}))
# Column rename, rendered as the cell's regular output.
frame2.rename(columns={'object': 'obj'})

ITEMS,obj,price,3price
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,pencil,1.2,3.6
1,paper,0.9,2.7
2,mug,1.7,5.1


In [33]:
# Nested mapping: outer keys become columns, inner keys become the row index,
# and every cell holds a Python list. The frame is still two-dimensional
# (3 rows x 2 columns of objects), not a true 3-D structure.
# Named `nested` rather than `dict` so the built-in `dict` type is not shadowed.
nested = {
    '1': {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
    '2': {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
}
df_3 = pd.DataFrame(nested)
print(df_3.iloc[1, 1])  # second row, second column -> the list stored in that cell
print(df_3['1'])        # column '1' as a Series whose values are lists
df_3

[4, 5, 6]
a    [1, 2, 3]
b    [4, 5, 6]
c    [7, 8, 9]
Name: 1, dtype: object


Unnamed: 0,1,2
a,"[1, 2, 3]","[1, 2, 3]"
b,"[4, 5, 6]","[4, 5, 6]"
c,"[7, 8, 9]","[7, 8, 9]"


In [34]:
# Load the two datasets used in the rest of the notebook. The paths are
# relative, so both CSV files are looked up in the current working directory.
anime = pd.read_csv('anime.csv')
rating = pd.read_csv('rating.csv')

Откуда может потребоваться загрузка данных?

CSV/TXT - просто обычный файлик (данные разделены пробелами/табами/запятыми)

Словари (например, те же json-чики, если нам необходимо считать)

Excel - чаще приходится выгружать в него, чем загружать из него, но тем не менее

БД - загрузки из баз данных (святое)

Какие есть функции для этого?

read_csv() - считываем данные из csv/txt-файла

read_excel() - считываем данные из Excel-файла (с какого-то из листов)

read_json() - считываем json-файл в качестве таблицы

read_sql() - считываем SQL-запрос (про это скажем в лекции про коннекторы)

https://pandas.pydata.org/pandas-docs/stable/reference/io.html

In [35]:
# Three random rows. A fixed random_state makes the sample (and therefore the
# rendered output) reproducible across kernel restarts.
anime.sample(3, random_state=42)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
4795,26053,Futari wa Precure: Splash☆Star Maji★Doki♥ Theater,"Action, Comedy, Fantasy, Magic, Shoujo",Movie,1,6.63,609
2281,79,Shuffle!,"Comedy, Drama, Ecchi, Fantasy, Harem, Magic, R...",TV,24,7.31,158772
6320,624,Mouse,"Action, Comedy, Ecchi, Harem, Shounen",TV,12,6.17,10979


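параметры для считывания: \
header=None - в файле нет строки с заголовками (иначе header задаёт номер строки с названиями колонок) \
names - список названий для колонок \
index_col - какую колонку использовать в качестве индекса \
on_bad_lines='skip' - пропускать некорректные строки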

Копирование

In [36]:
# Independent copy of the frame (copy() is deep by default), so changes to
# `anime2` do not touch `anime`.
anime2 = anime.copy()

In [37]:
# Preview the first three rows of the freshly loaded frame.
anime.head(3)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262


Сортировка

In [38]:
# `episodes` is stored as strings (it contains the placeholder 'Unknown'), so a
# plain sort would order it lexicographically ('10' before '2'). The key
# function converts that column to numbers for the comparison only; values that
# cannot be parsed become NaN and are placed last. Other sort columns are
# passed through unchanged. Ties on episodes are broken by rating, descending.
(
    anime
    .sort_values(
        by=['episodes', 'rating'],
        ascending=[True, False],
        key=lambda col: pd.to_numeric(col, errors='coerce') if col.name == 'episodes' else col,
    )
    .head(3)
)


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
10464,33662,Taka no Tsume 8: Yoshida-kun no X-Files,"Comedy, Parody",Movie,1,10.0,13
9595,23005,Mogura no Motoro,Slice of Life,Movie,1,9.5,62
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630


## Срезы

In [39]:
# Positional row slice: the first three rows.
anime.iloc[:3]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262


Замечание: сначала лучше обращаться к названиям колонок, потом - к строкам

In [40]:
# Pick the columns first, then take the first three rows by position.
anime[['name', 'genre']].iloc[:3]

Unnamed: 0,name,genre
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural"
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili..."
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S..."


In [41]:
# Use the 'name' column as the row index (it stops being a regular column).
# NOTE(review): rebinding `anime` makes this cell order-dependent — presumably a
# second run fails because 'name' is no longer a column; confirm before re-running.
anime = anime.set_index("name")
anime.head(3)

Unnamed: 0_level_0,anime_id,genre,type,episodes,rating,members
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Kimi no Na wa.,32281,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
Fullmetal Alchemist: Brotherhood,5114,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
Gintama°,28977,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262


In [42]:
# Same three genres fetched two ways: by row position, then by index label.
print(anime['genre'].iloc[0:3])
anime.loc[['Kimi no Na wa.', 'Fullmetal Alchemist: Brotherhood', 'Gintama°'], 'genre']

name
Kimi no Na wa.                                   Drama, Romance, School, Supernatural
Fullmetal Alchemist: Brotherhood    Action, Adventure, Drama, Fantasy, Magic, Mili...
Gintama°                            Action, Comedy, Historical, Parody, Samurai, S...
Name: genre, dtype: object


name
Kimi no Na wa.                                   Drama, Romance, School, Supernatural
Fullmetal Alchemist: Brotherhood    Action, Adventure, Drama, Fantasy, Magic, Mili...
Gintama°                            Action, Comedy, Historical, Parody, Samurai, S...
Name: genre, dtype: object

## loc, iloc

In [43]:
# iloc is purely positional: rows 0-2 and columns 3-4, end positions excluded.
anime.iloc[0:3, 3:5] # positional, like indexing an array

Unnamed: 0_level_0,episodes,rating
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kimi no Na wa.,1,9.37
Fullmetal Alchemist: Brotherhood,64,9.26
Gintama°,51,9.25


In [44]:
# loc is label-based and its slices include BOTH endpoints: the 'Gintama°' row
# and the 'rating' column are part of the result.
anime.loc['Kimi no Na wa.':'Gintama°', 'anime_id':'rating']

Unnamed: 0_level_0,anime_id,genre,type,episodes,rating
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Kimi no Na wa.,32281,"Drama, Romance, School, Supernatural",Movie,1,9.37
Fullmetal Alchemist: Brotherhood,5114,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26
Gintama°,28977,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25


In [45]:
# Move the index back into a regular column. The index is already named 'name'
# (set by set_index earlier), so the restored column gets that label.
anime = anime.reset_index()

In [46]:
# Check the result of reset_index: 'name' is a regular column again and the
# rows are numbered from 0.
anime.head(3)

Unnamed: 0,name,anime_id,genre,type,episodes,rating,members
0,Kimi no Na wa.,32281,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,Fullmetal Alchemist: Brotherhood,5114,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,Gintama°,28977,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262


In [47]:
# (number of rows, number of columns)
anime.shape

(12294, 7)

In [48]:
print(len(anime))  # number of rows in the frame
print('-' * 30)
# info() writes its report to stdout itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
anime.info()       # per-column dtype and non-null counts
print('-' * 30)
print(anime.describe())  # summary statistics for the numeric columns

12294
------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      12294 non-null  object 
 1   anime_id  12294 non-null  int64  
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None
------------------------------
           anime_id        rating       members
count  12294.000000  12064.000000  1.229400e+04
mean   14058.221653      6.473902  1.807134e+04
std    11455.294701      1.026746  5.482068e+04
min        1.000000      1.670000  5.000000e+00
25%     3484.250000      5.880000  2.250000e+02
50%    10260.500000      6.570000  1.550000e+03
75%    24794.500000      7.180000  9.437000e+03
max    34527.000000 

In [49]:
# Distinct values of the 'episodes' column, in order of first appearance.
anime['episodes'].unique()

array(['1', '64', '51', '24', '10', '148', '110', '13', '201', '25', '22',
       '75', '4', '26', '12', '27', '43', '74', '37', '2', '11', '99',
       'Unknown', '39', '101', '47', '50', '62', '33', '112', '23', '3',
       '94', '6', '8', '14', '7', '40', '15', '203', '77', '291', '120',
       '102', '96', '38', '79', '175', '103', '70', '153', '45', '5',
       '21', '63', '52', '28', '145', '36', '69', '60', '178', '114',
       '35', '61', '34', '109', '20', '9', '49', '366', '97', '48', '78',
       '358', '155', '104', '113', '54', '167', '161', '42', '142', '31',
       '373', '220', '46', '195', '17', '1787', '73', '147', '127', '16',
       '19', '98', '150', '76', '53', '124', '29', '115', '224', '44',
       '58', '93', '154', '92', '67', '172', '86', '30', '276', '59',
       '72', '330', '41', '105', '128', '137', '56', '55', '65', '243',
       '193', '18', '191', '180', '91', '192', '66', '182', '32', '164',
       '100', '296', '694', '95', '68', '117', '151', '130',

In [50]:
# Names of the first three titles whose episode count is the placeholder 'Unknown'.
anime.loc[anime['episodes'] == 'Unknown', 'name'].head(3)

74              One Piece
252       Detective Conan
615    Naruto: Shippuuden
Name: name, dtype: object

In [51]:
# Frequency of each distinct genre string, most common first.
anime['genre'].value_counts()

genre
Hentai                                                  823
Comedy                                                  523
Music                                                   301
Kids                                                    199
Comedy, Slice of Life                                   179
                                                       ... 
Adventure, Drama, Fantasy, Game, Sci-Fi                   1
Adventure, Demons, Fantasy, Historical                    1
Action, Comedy, Drama, Mecha, Music, Sci-Fi, Shounen      1
Action, Comedy, Fantasy, Mecha, Sci-Fi, Shounen           1
Hentai, Slice of Life                                     1
Name: count, Length: 3264, dtype: int64

In [52]:
# Assigning a scalar to a new column label adds that column to `anime` in place,
# with the same value in every row.
anime['is_anime'] = True

In [53]:
# Confirm the new 'is_anime' column is present.
anime.head(3)

Unnamed: 0,name,anime_id,genre,type,episodes,rating,members,is_anime
0,Kimi no Na wa.,32281,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,True
1,Fullmetal Alchemist: Brotherhood,5114,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,True
2,Gintama°,28977,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,True


In [54]:
# NOTE(review): drop() returns a new DataFrame and the result is not assigned,
# so `anime` still has the 'is_anime' column after this cell.
anime.drop('is_anime', axis=1)
# Bare None as the last expression: the cell renders no output.
None

In [55]:
# Concatenate the frame with an independent copy of itself in both directions.
new_anime = anime.copy()
# Stacked vertically: rows are appended and renumbered from 0.
b = pd.concat(objs=[anime, new_anime], ignore_index=True)
# Side by side: columns are appended; ignore_index renumbers the column labels.
c = pd.concat(objs=[anime, new_anime], axis=1, ignore_index=True)
print(b.shape, c.shape)
b.tail(3)

(24588, 8) (12294, 16)


Unnamed: 0,name,anime_id,genre,type,episodes,rating,members,is_anime
24585,Violence Gekiga David no Hoshi,5621,Hentai,OVA,4,4.88,219,True
24586,Violence Gekiga Shin David no Hoshi: Inma Dens...,6133,Hentai,OVA,1,4.98,175,True
24587,Yasuji no Pornorama: Yacchimae!!,26081,Hentai,Movie,1,5.46,142,True


In [56]:
# Per-type summary: mean rating and number of titles. A handy pattern worth reusing.
(
    anime
    .groupby("type")
    .agg(rating=("rating", "mean"), name=("name", "count"))
    .reset_index()
)

Unnamed: 0,type,rating,name
0,Movie,6.318058,2348
1,Music,5.588996,488
2,ONA,5.643298,659
3,OVA,6.375221,3311
4,Special,6.523501,1676
5,TV,6.902299,3787


In [57]:
# 1 when the genre text mentions 'Hentai', else 0; missing genres count as 0.
anime['adult'] = (
    anime['genre']
    .str.contains('Hentai', regex=False, na=False)
    .astype('int64')
)