In [202]:
# 정수는 대부분의 컴퓨터에서 부동소수점이 아니라 정수로 데이터셋을 가져오는 것이 이상적이다. 또 다른 예로 데이터셋에 날짜가 있다면 
# 문자열이 아닌 날짜/시간으로 데이터셋을 가져오는 것이 좋다.
# 날짜/시간 유형은 문자열과 달리 날짜와 시간에 특화된 연산자를 제공하기 때문이다.
# 더 빠르게 데이터를 필터링할 수 있도록 열 데이터를 다른 유형으로 변환하여 메모리 사용량을 줄이는 몇가지 방법을 살펴보자.

import pandas as pd

In [203]:
# 모든 열에는 결측값이 있다. 심지어 마지막 행은 NaN으로만 구성되어 있다.
# 이와 같은 불완전한 데이터는 현실 세계에서 흔히 볼 수 있따. 데이터셋은 빈행, 빈열은 포함할 수 있다.
# 
temp = pd.read_csv("employees.csv")

In [204]:
temp["Salary"].nunique()

994

In [205]:
employees = pd.read_csv("employees.csv", parse_dates = ["Start Date"])

In [206]:
# info 메서드를 호출하여 각 열의 데이터 유형, 결측값의 개수를 목록으로 확인할 수 있다. 
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    object        
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      999 non-null    float64       
 4   Mgmt        933 non-null    object        
 5   Team        957 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 47.0+ KB


In [207]:
employees["Mgmt"].astype(bool)

0        True
1        True
2       False
3        True
4        True
        ...  
996     False
997     False
998     False
999      True
1000     True
Name: Mgmt, Length: 1001, dtype: bool

In [208]:
employees["Mgmt"] = employees["Mgmt"].astype(bool)

In [209]:
employees.tail()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales
1000,,,NaT,,True,


In [210]:
# employees의 메모리 사용량을 많이 줄였다.
# 판다스는 NaN값을 처리하기 위해정수를 부동소수점을 로 변환한다.
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    object        
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      999 non-null    float64       
 4   Mgmt        1001 non-null   bool          
 5   Team        957 non-null    object        
dtypes: bool(1), datetime64[ns](1), float64(1), object(3)
memory usage: 40.2+ KB


In [211]:
# 판다스는 NaN값을 정수로 변환할 수 없다. NaN값을 상수 값으로 대체하면 이문제를 해결할 수 있다.
employees["Salary"].astype(int)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [212]:
# fillna를 사용하여 대체가 가능하다. 
# fillna를 사용할 때 임으로 값을 선택하여 null값을 대체하면 데이터가 왜곡될 수 있다.
employees["Salary"].fillna(0)

0            0.0
1        61933.0
2       130590.0
3       138705.0
4       101004.0
          ...   
996      42392.0
997      96914.0
998      60500.0
999     129949.0
1000         0.0
Name: Salary, Length: 1001, dtype: float64

In [213]:
# salary열에 결측값이 없으므로 astype 메세드를 사용하여 해당 값을 정수로 변환할 수 있다.
# 결측값이 있으면 정수로 변환안되는 것 숙지하자!
employees["Salary"].fillna(0).astype(int)

0            0
1        61933
2       130590
3       138705
4       101004
         ...  
996      42392
997      96914
998      60500
999     129949
1000         0
Name: Salary, Length: 1001, dtype: int64

In [214]:
employees["Salary"] =  employees["Salary"].fillna(0).astype(int)

In [215]:
employees.nunique()

First Name    200
Gender          2
Start Date    971
Salary        995
Mgmt            2
Team           10
dtype: int64

In [216]:
employees["Gender"].astype("category")

0         Male
1         Male
2       Female
3          NaN
4         Male
         ...  
996       Male
997       Male
998       Male
999       Male
1000       NaN
Name: Gender, Length: 1001, dtype: category
Categories (2, object): ['Female', 'Male']

In [217]:
employees["Gender"] = employees["Gender"].astype("category")

In [218]:
employees["Team"] = employees["Team"].astype("category")

In [219]:
# 데이터프레임의 메모리 사용량을 50%이상
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    category      
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      1001 non-null   int64         
 4   Mgmt        1001 non-null   bool          
 5   Team        957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](1), int64(1), object(1)
memory usage: 27.0+ KB


In [220]:
marias = employees["First Name"] == "Maria"
employees[marias]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
2,Maria,Female,NaT,130590,False,Finance
198,Maria,Female,1990-12-27,36067,True,Product
815,Maria,,1986-01-18,106562,False,HR
844,Maria,,1985-06-19,148857,False,Legal
936,Maria,Female,2003-03-14,96250,False,Business Dev
984,Maria,Female,2011-10-15,43455,False,Engineering


In [221]:
"Finance" != "Engineering"

True

In [222]:
employees.loc[employees["Team"] != "Finance"]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
6,Ruby,Female,1987-08-17,65476,True,Product
...,...,...,...,...,...,...
995,Henry,,2014-11-23,132483,False,Distribution
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev
999,Albert,Male,2012-05-15,129949,True,Sales


In [223]:
employees[employees["Mgmt"]]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
6,Ruby,Female,1987-08-17,65476,True,Product
...,...,...,...,...,...,...
992,Anthony,Male,2011-10-16,112769,True,Finance
993,Tina,Female,1997-05-15,56450,True,Engineering
994,George,Male,2013-06-21,98874,True,Marketing
999,Albert,Male,2012-05-15,129949,True,Sales


In [224]:
high_earners = employees["Gender"] > 100000


TypeError: Unordered Categoricals can only compare equality or not

In [225]:
employees[high_earners]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
12,Brandon,Male,1980-12-01,112807,True,HR
...,...,...,...,...,...,...
994,George,Male,2013-06-21,98874,True,Marketing
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev


In [226]:
is_female = employees["Gender"] == "Female"

In [227]:
is_biz_dev = employees["Team"] == "Business Dev"

In [228]:
employees[is_female & is_biz_dev].head(5)

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
9,Frances,Female,2002-08-08,139852,True,Business Dev
33,Jean,Female,1993-12-18,119082,False,Business Dev
36,Rachel,Female,2009-02-16,142032,False,Business Dev
38,Stephanie,Female,1986-09-13,36844,True,Business Dev
61,Denise,Female,2001-11-06,106862,False,Business Dev


In [229]:
is_manager = employees["Mgmt"]
employees[is_female & is_biz_dev & is_manager].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
9,Frances,Female,2002-08-08,139852,True,Business Dev
38,Stephanie,Female,1986-09-13,36844,True,Business Dev
66,Nancy,Female,2012-12-15,125250,True,Business Dev
92,Linda,Female,2000-05-25,119009,True,Business Dev
111,Bonnie,Female,1999-12-17,42153,True,Business Dev


In [230]:
earnings_below_40k = employees["Salary"] < 40000
started_after_2015 = employees["Start Date"] > "2015-01-01"

In [231]:
employees.loc[earnings_below_40k | started_after_2015].tail()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
958,Gloria,Female,1987-10-24,39833,False,Engineering
964,Bruce,Male,1980-05-07,35802,True,Sales
967,Thomas,Male,2016-03-12,105681,False,Engineering
989,Justin,,1991-02-10,38344,False,Legal
1000,,,NaT,0,True,


In [232]:
my_series = pd.Series([True, False, True])
my_series

0     True
1    False
2     True
dtype: bool

In [233]:
~my_series

0    False
1     True
2    False
dtype: bool

In [234]:
employees[employees["Salary"] < 100000].count()

First Name    556
Gender        502
Start Date    591
Salary        592
Mgmt          592
Team          566
dtype: int64

In [235]:
employees.describe()

Unnamed: 0,Salary
count,1001.0
mean,90474.398601
std,33154.870101
min,0.0
25%,62371.0
50%,90370.0
75%,118736.0
max,149908.0


In [236]:
employees["Salary"].nunique()

995

In [239]:
employees[~(employees["Salary"] >= 100000)]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
6,Ruby,Female,1987-08-17,65476,True,Product
7,,Female,2015-07-20,45906,True,Finance
8,Angela,Female,2005-11-22,95570,True,Engineering
...,...,...,...,...,...,...
994,George,Male,2013-06-21,98874,True,Marketing
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev


In [240]:
# 이러한 방법은 확장성이 없다 15명이면 일일히 다하게?
sales = employees["Team"] == "Sales"
legal = employees["Team"] == "Legal"
marketing = employees["Team"] == "Marketing"
employees[sales | legal | marketing]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
5,Dennis,Male,1987-04-18,115163,False,Legal
11,Julie,Female,1997-10-26,102508,True,Legal
13,Gary,Male,2008-01-27,109831,False,Sales
20,Lois,,1995-04-22,64714,True,Legal
...,...,...,...,...,...,...
986,Donna,Female,1982-11-26,82871,False,Marketing
989,Justin,,1991-02-10,38344,False,Legal
991,Rose,Female,2002-08-25,134505,True,Marketing
994,George,Male,2013-06-21,98874,True,Marketing


In [None]:
# 확장성이 있는 방법이 핋요하다면 반복가능한 요소 (리스트, 튜플, 시리즈 등)을 인수로 받아 불리언 시리즈를 반환하는 Isin aptjemfmf tkdyd

In [243]:
all_star_teams = ["Sales", "Legal", "Marketing"]
on_all_teams = employees["Team"].isin(all_star_teams)
employees[on_all_teams].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
5,Dennis,Male,1987-04-18,115163,False,Legal
11,Julie,Female,1997-10-26,102508,True,Legal
13,Gary,Male,2008-01-27,109831,False,Sales
20,Lois,,1995-04-22,64714,True,Legal


In [246]:
between_80k_and_90k = employees["Salary"].between(80000,90000)
employees[between_80k_and_90k]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
19,Donna,Female,2010-07-22,81014,False,Product
31,Joyce,,2005-02-20,88657,False,Product
35,Theresa,Female,2006-10-10,85182,False,Sales
45,Roger,Male,1980-04-17,88010,True,Sales
54,Sara,Female,2007-08-15,83677,False,Engineering
...,...,...,...,...,...,...
930,Nancy,Female,2001-09-10,85213,True,Marketing
956,Beverly,Female,1986-10-17,80838,False,Engineering
963,Ann,Female,1994-09-23,89443,True,Sales
985,Stephen,,1983-07-10,85668,False,Legal


In [249]:
# 날짜 시간을 필터링하려면 시간범위의 시작과 끝날짜를 문자열로 지정하면된다. left와 right
eighties_folk = employees["Start Date"].between(
        left = "1980-01-01",
        right = "1990-01-01"
        )

employees[eighties_folk].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
5,Dennis,Male,1987-04-18,115163,False,Legal
6,Ruby,Female,1987-08-17,65476,True,Product
10,Louise,Female,1980-08-12,63241,True,
12,Brandon,Male,1980-12-01,112807,True,HR
17,Shawn,Male,1986-12-07,111737,False,Product


In [253]:
# 문자열 열에도 between 메서드를 적용할 수 있다.
# 
name_starts_with_r = employees["First Name"].between("R", "S")
employees[name_starts_with_r].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
6,Ruby,Female,1987-08-17,65476,True,Product
36,Rachel,Female,2009-02-16,142032,False,Business Dev
45,Roger,Male,1980-04-17,88010,True,Sales
67,Rachel,Female,1999-08-16,51178,True,Finance
78,Robin,Female,1983-06-04,114797,True,Sales


In [255]:
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,NaT,130590,False,Finance
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT


In [257]:
employees["Team"].isnull().head()

0    False
1     True
2    False
3    False
4    False
Name: Team, dtype: bool

In [259]:
employees["Start Date"].isnull().head()

0    False
1    False
2     True
3    False
4    False
Name: Start Date, dtype: bool

In [261]:
employees["Team"].notnull().head() # notnull이 좀 더 명확함

0     True
1    False
2     True
3     True
4     True
Name: Team, dtype: bool

In [266]:
no_team = employees["Team"].isnull()
employees[no_team]

# 조건을 만들어 변수에 넣고 
# 출력을 한다.

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
1,Thomas,Male,1996-03-31,61933,True,
10,Louise,Female,1980-08-12,63241,True,
23,,Male,2012-06-14,125792,True,
32,,Male,1998-08-21,122340,True,
91,James,,2005-01-26,128771,False,
109,Christopher,Male,2000-04-22,37919,False,
139,,Female,1990-10-03,132373,True,
199,Jonathan,Male,2009-07-17,130581,True,
258,Michael,Male,2002-01-24,43586,False,
290,Jeremy,Male,1988-06-14,129460,True,


In [270]:
has_name = employees["First Name"].notnull()
employees[has_name]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,NaT,130590,False,Finance
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
...,...,...,...,...,...,...
995,Henry,,2014-11-23,132483,False,Distribution
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev


# Null값 다루기 

In [271]:
employees = pd.read_csv(
            "employees.csv", parse_dates = ["Start Date"]        
)

In [273]:
employees

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
...,...,...,...,...,...,...
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales


In [275]:
employees.dropna()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
4,Larry,Male,1998-01-24,101004.0,True,IT
5,Dennis,Male,1987-04-18,115163.0,False,Legal
6,Ruby,Female,1987-08-17,65476.0,True,Product
8,Angela,Female,2005-11-22,95570.0,True,Engineering
9,Frances,Female,2002-08-08,139852.0,True,Business Dev
...,...,...,...,...,...,...
994,George,Male,2013-06-21,98874.0,True,Marketing
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev


In [277]:
# 모든 value가 null인 값만 제거한다.
employees.dropna(how = "all")

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
...,...,...,...,...,...,...
995,Henry,,2014-11-23,132483.0,False,Distribution
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev


In [279]:
employees.dropna(how = "any")

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
4,Larry,Male,1998-01-24,101004.0,True,IT
5,Dennis,Male,1987-04-18,115163.0,False,Legal
6,Ruby,Female,1987-08-17,65476.0,True,Product
8,Angela,Female,2005-11-22,95570.0,True,Engineering
9,Frances,Female,2002-08-08,139852.0,True,Business Dev
...,...,...,...,...,...,...
994,George,Male,2013-06-21,98874.0,True,Marketing
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev


In [281]:
employees.dropna(subset = ["Gender"]).tail()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
994,George,Male,2013-06-21,98874.0,True,Marketing
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales


In [283]:
employees.dropna(subset = ["Start Date", "Salary"]).head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
1,Thomas,Male,1996-03-31,61933.0,True,
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
5,Dennis,Male,1987-04-18,115163.0,False,Legal
6,Ruby,Female,1987-08-17,65476.0,True,Product


In [291]:
# thresh는 판다스가 행을 유지하는 조건으로 최소 몇 개의 null이 아닌 값을 가져야 하는지 
employees.dropna(how='any')
employees.dropna(thresh=4)

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
...,...,...,...,...,...,...
995,Henry,,2014-11-23,132483.0,False,Distribution
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev


In [293]:
employees["Team"].head()

0    Marketing
1          NaN
2      Finance
3      Finance
4           IT
Name: Team, dtype: object

In [294]:
employees["Team"].duplicated().head()

0    False
1    False
2    False
3     True
4    False
Name: Team, dtype: bool

In [295]:
# 기본 인수인 First는 중복값이 첫 번째로 나타난 인덱스를 False로 표시하여 값을 유지한다. 
employees["Team"].duplicated(keep ="first").head()

0    False
1    False
2    False
3     True
4    False
Name: Team, dtype: bool

In [296]:
# keep 매개변수에 last 문자열을 전달한다. 마지막으로 나타나는 값을 중복되지 않은 값으로 표시하도록 요청
employees["Team"].duplicated(keep = "last")

0        True
1        True
2        True
3        True
4        True
        ...  
996     False
997     False
998     False
999     False
1000    False
Name: Team, Length: 1001, dtype: bool

In [297]:
employees.drop_duplicates()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
...,...,...,...,...,...,...
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales


In [298]:
employees.drop_duplicates(subset = ["Team"])

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
5,Dennis,Male,1987-04-18,115163.0,False,Legal
6,Ruby,Female,1987-08-17,65476.0,True,Product
8,Angela,Female,2005-11-22,95570.0,True,Engineering
9,Frances,Female,2002-08-08,139852.0,True,Business Dev
12,Brandon,Male,1980-12-01,112807.0,True,HR
13,Gary,Male,2008-01-27,109831.0,False,Sales


In [301]:
# 이 때 유지되는 행은 데이터셋의 뒤쪽에 있을 가능성이 높다. 'Alice'는 데이터셋에 있는 'HR'팀의 마지막 직원이고 justin은 Legal 팀의 마지막 지원이다.
employees.drop_duplicates(subset = ["Team"], keep = "last")

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
988,Alice,Female,2004-10-05,47638.0,False,HR
989,Justin,,1991-02-10,38344.0,False,Legal
990,Robin,Female,1987-07-24,100765.0,True,IT
993,Tina,Female,1997-05-15,56450.0,True,Engineering
994,George,Male,2013-06-21,98874.0,True,Marketing
995,Henry,,2014-11-23,132483.0,False,Distribution
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales


In [302]:
# 독자적으로 혼자 존재하는 이름... 2개이상이면 다 삭제
employees.drop_duplicates(subset = ["First Name"], keep = False)

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
5,Dennis,Male,1987-04-18,115163.0,False,Legal
8,Angela,Female,2005-11-22,95570.0,True,Engineering
33,Jean,Female,1993-12-18,119082.0,False,Business Dev
190,Carol,Female,1996-03-19,57783.0,False,Finance
291,Tammy,Female,1984-11-11,132839.0,True,IT
495,Eugene,Male,1984-05-24,81077.0,False,Sales
688,Brian,Male,2007-04-07,93901.0,True,Legal
832,Keith,Male,2003-02-12,120672.0,False,Legal
887,David,Male,2009-12-05,92242.0,False,Legal


In [314]:
name_is_douglas = employees["First Name"] == "Douglas"
is_male = employees["Gender"] == "Male"
employees[name_is_douglas & is_male]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
217,Douglas,Male,1999-09-03,83341.0,True,IT
322,Douglas,Male,2002-01-08,41428.0,False,Product
835,Douglas,Male,2007-08-04,132175.0,False,Engineering


# 1. 제한된 메모리 샤용량과 사용성을 최대화하기 위해 데이터셋을 최적화해라

In [319]:
netflix = pd.read_csv(("netflix.csv"), parse_dates = ["date_added"])
netflix

Unnamed: 0,title,director,date_added,type
0,Alias Grace,,2017-11-03,TV Show
1,A Patch of Fog,Michael Lennox,2017-04-15,Movie
2,Lunatics,,2019-04-19,TV Show
3,Uriyadi 2,Vijay Kumar,2019-08-02,Movie
4,Shrek the Musical,Jason Moore,2013-12-29,Movie
...,...,...,...,...
5832,The Pursuit,John Papola,2019-08-07,Movie
5833,Hurricane Bianca,Matt Kugelman,2017-01-01,Movie
5834,Amar's Hands,Khaled Youssef,2019-04-26,Movie
5835,Bill Nye: Science Guy,Jason Sussberg,2018-04-25,Movie


In [320]:
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5837 entries, 0 to 5836
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   title       5837 non-null   object        
 1   director    3936 non-null   object        
 2   date_added  5195 non-null   datetime64[ns]
 3   type        5837 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 182.5+ KB


In [321]:
netflix.nunique()

title         5780
director      3024
date_added    1092
type             2
dtype: int64

In [322]:
netflix["type"] = netflix["type"].astype("category")

In [324]:
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5837 entries, 0 to 5836
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   title       5837 non-null   object        
 1   director    3936 non-null   object        
 2   date_added  5195 non-null   datetime64[ns]
 3   type        5837 non-null   category      
dtypes: category(1), datetime64[ns](1), object(2)
memory usage: 142.8+ KB


In [325]:
netflix

Unnamed: 0,title,director,date_added,type
0,Alias Grace,,2017-11-03,TV Show
1,A Patch of Fog,Michael Lennox,2017-04-15,Movie
2,Lunatics,,2019-04-19,TV Show
3,Uriyadi 2,Vijay Kumar,2019-08-02,Movie
4,Shrek the Musical,Jason Moore,2013-12-29,Movie
...,...,...,...,...
5832,The Pursuit,John Papola,2019-08-07,Movie
5833,Hurricane Bianca,Matt Kugelman,2017-01-01,Movie
5834,Amar's Hands,Khaled Youssef,2019-04-26,Movie
5835,Bill Nye: Science Guy,Jason Sussberg,2018-04-25,Movie


In [330]:
netflix.loc[netflix["title"] == "Limitless"]

Unnamed: 0,title,director,date_added,type
1559,Limitless,Neil Burger,2019-05-16,Movie
2564,Limitless,,2016-07-01,TV Show
4579,Limitless,Vrinda Samartha,2019-10-01,Movie


In [334]:
robert_rodriguez_made = netflix.loc[netflix["director"] == "Robert Rodriguez"]
netflix.loc[(netflix["type"] == "Movie") & (netflix["director"] == "Robert Rodriguez")]


Unnamed: 0,title,director,date_added,type
1384,Spy Kids: All the Time in the World,Robert Rodriguez,2019-02-19,Movie
1416,Spy Kids 3: Game Over,Robert Rodriguez,2019-04-01,Movie
1460,Spy Kids 2: The Island of Lost Dreams,Robert Rodriguez,2019-03-08,Movie
2890,Sin City,Robert Rodriguez,2019-10-01,Movie
3836,Shorts,Robert Rodriguez,2019-07-01,Movie
3883,Spy Kids,Robert Rodriguez,2019-04-01,Movie


In [340]:
date_0731 = netflix["date_added"] == "2019-07-31"
robert_altman = netflix["director"] == "Robert Altman"
netflix[date_0731 | robert_altman]

Unnamed: 0,title,director,date_added,type
611,Popeye,Robert Altman,2019-11-24,Movie
1028,The Red Sea Diving Resort,Gideon Raff,2019-07-31,Movie
1092,Gosford Park,Robert Altman,2019-11-01,Movie
3473,Bangkok Love Stories: Innocence,,2019-07-31,TV Show
5117,Ramen Shop,Eric Khoo,2019-07-31,Movie


In [361]:

directors = ["Orson Welles","Aditya Kripalani","Sam Raimi"]
target_director = netflix["director"].isin(directors)
netflix[target_director]


Unnamed: 0,title,director,date_added,type
946,The Stranger,Orson Welles,2018-07-19,Movie
1870,The Gift,Sam Raimi,2019-11-20,Movie
3706,Spider-Man 3,Sam Raimi,2019-11-01,Movie
4243,Tikli and Laxmi Bomb,Aditya Kripalani,2018-08-01,Movie
4475,The Other Side of the Wind,Orson Welles,2018-11-02,Movie
5115,Tottaa Pataaka Item Maal,Aditya Kripalani,2019-06-25,Movie


In [346]:
netflix_date = netflix["date_added"].between(
        left = "2019-05-01",
        right = "2019-06-01"
)
netflix[netflix_date]

Unnamed: 0,title,director,date_added,type
29,Chopsticks,Sachin Yardi,2019-05-31,Movie
60,Away From Home,,2019-05-08,TV Show
82,III Smoking Barrels,Sanjib Dey,2019-06-01,Movie
108,Jailbirds,,2019-05-10,TV Show
124,Pegasus,Han Han,2019-05-31,Movie
...,...,...,...,...
5671,Satan & Adam,V. Scott Balcerek,2019-06-01,Movie
5675,Rim of the World,McG,2019-05-24,Movie
5677,Malibu Rescue,Savage Steve Holland,2019-05-13,Movie
5739,Mission Istaanbul: Darr Ke Aagey Jeet Hai,Apoorva Lakhia,2019-05-16,Movie


In [351]:
netflix.dropna(subset = "director", how = "all")

Unnamed: 0,title,director,date_added,type
1,A Patch of Fog,Michael Lennox,2017-04-15,Movie
3,Uriyadi 2,Vijay Kumar,2019-08-02,Movie
4,Shrek the Musical,Jason Moore,2013-12-29,Movie
5,Schubert In Love,Lars Büchel,2018-03-01,Movie
6,We Have Always Lived in the Castle,Stacie Passon,2019-09-14,Movie
...,...,...,...,...
5830,Bibi & Tina,Detlev Buck,2017-04-15,Movie
5832,The Pursuit,John Papola,2019-08-07,Movie
5833,Hurricane Bianca,Matt Kugelman,2017-01-01,Movie
5834,Amar's Hands,Khaled Youssef,2019-04-26,Movie


In [356]:
netflix["date_added"].nunique

<bound method IndexOpsMixin.nunique of 0      2017-11-03
1      2017-04-15
2      2019-04-19
3      2019-08-02
4      2013-12-29
          ...    
5832   2019-08-07
5833   2017-01-01
5834   2019-04-26
5835   2018-04-25
5836          NaT
Name: date_added, Length: 5837, dtype: datetime64[ns]>

In [358]:
netflix.drop_duplicates(subset = ["date_added"], keep = False)

Unnamed: 0,title,director,date_added,type
4,Shrek the Musical,Jason Moore,2013-12-29,Movie
12,Without Gorky,Cosima Spender,2017-05-31,Movie
30,Anjelah Johnson: Not Fancy,Jay Karas,2015-10-02,Movie
38,One Last Thing,Tim Rouhana,2019-08-25,Movie
70,Marvel's Iron Man & Hulk: Heroes United,Leo Riley,2014-02-16,Movie
...,...,...,...,...
5748,Menorca,John Barnard,2017-08-27,Movie
5749,Green Room,Jeremy Saulnier,2018-11-12,Movie
5788,Chris Brown: Welcome to My Life,Andrew Sandler,2017-10-07,Movie
5789,A Very Murray Christmas,Sofia Coppola,2015-12-04,Movie
