# Exploring DataFrame and Series Objects

In [2]:
import pandas as pd

In [3]:
revenues = pd.Series([
    5555, 7000, 1980
])
revenues

0    5555
1    7000
2    1980
dtype: int64

In [4]:
revenues.values

array([5555, 7000, 1980], dtype=int64)

In [5]:
revenues.index

RangeIndex(start=0, stop=3, step=1)

In [6]:
city_revenues = pd.Series(
    [4200, 5555, 6500],
    index = ['Amsterdam', 'Toronto', 'Tokyo']
)

city_revenues

Amsterdam    4200
Toronto      5555
Tokyo        6500
dtype: int64

In [7]:
city_revenues['Toronto']

5555

In [8]:
city_employee_count_data = {'Amsterdam': 5, 'Tokyo': 8}
city_employee_count = pd.Series(city_employee_count_data)
city_employee_count

Amsterdam    5
Tokyo        8
dtype: int64

In [9]:
city_employee_count.keys()

Index(['Amsterdam', 'Tokyo'], dtype='object')

In [10]:
'Tokyo' in city_employee_count

True

In [11]:
city_data = pd.DataFrame({
    'revenue': city_revenues,
    'employee_count': city_employee_count,
})

city_data

Unnamed: 0,revenue,employee_count
Amsterdam,4200,5.0
Tokyo,6500,8.0
Toronto,5555,


In [12]:
city_data.axes

[Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object'),
 Index(['revenue', 'employee_count'], dtype='object')]

# Accessing data in a Series

In [13]:
city_data.axes

[Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object'),
 Index(['revenue', 'employee_count'], dtype='object')]

In [14]:
city_revenues[1]

5555

In [15]:
city_revenues['Toronto']

5555

In [16]:
city_revenues[-1]

6500

In [17]:
city_revenues['Toronto':]

Toronto    5555
Tokyo      6500
dtype: int64

In [18]:
colors = pd.Series(
    ['red', 'purple', 'blue', 'green', 'yellow'],
    index=[1, 2, 3, 5, 8]
)
colors

1       red
2    purple
3      blue
5     green
8    yellow
dtype: object

In [19]:
colors[1]

'red'

In [20]:
colors.loc[1]

'red'

In [21]:
colors.iloc[1]

'purple'

In [22]:
colors.iloc[1:3]

2    purple
3      blue
dtype: object

In [23]:
colors.loc[3:8]

3      blue
5     green
8    yellow
dtype: object

In [24]:
colors.iloc[-2]

'green'

# Accessing Data in a DataFrame

In [25]:
city_data.revenue

Amsterdam    4200
Tokyo        6500
Toronto      5555
Name: revenue, dtype: int64

In [26]:
city_data.loc['Amsterdam']

revenue           4200.0
employee_count       5.0
Name: Amsterdam, dtype: float64

In [27]:
city_data.iloc[0]

revenue           4200.0
employee_count       5.0
Name: Amsterdam, dtype: float64

In [28]:
city_data.loc['Tokyo': 'Toronto']

Unnamed: 0,revenue,employee_count
Tokyo,6500,8.0
Toronto,5555,


In [29]:
city_data.loc['Tokyo': 'Toronto', 'revenue']

Tokyo      6500
Toronto    5555
Name: revenue, dtype: int64

In [30]:
city_revenues

Amsterdam    4200
Toronto      5555
Tokyo        6500
dtype: int64

In [31]:
city_revenues.sum()

16255

In [32]:
city_revenues.max()

6500

In [33]:
city_revenues.min()

4200

In [34]:
city_revenues.mean()

5418.333333333333

# Combining Multiple Datasets

In [35]:
city_data

Unnamed: 0,revenue,employee_count
Amsterdam,4200,5.0
Tokyo,6500,8.0
Toronto,5555,


It has a `revenue` and `employee_count` column. Now create another `DataFrame` with the same columns.

In [36]:
further_city_data = pd.DataFrame(
    {
        'revenue': [7000, 3400],
        'employee': [2, 2]
    },
    index=['New York', 'Barcelona']
)

Using the `concat()` function in Pandas, these two `DataFrames` can be combined.

In [38]:
all_city_data = pd.concat([city_data, further_city_data], sort=False)
all_city_data

Unnamed: 0,revenue,employee_count,employee
Amsterdam,4200,5.0,
Tokyo,6500,8.0,
Toronto,5555,,
New York,7000,,2.0
Barcelona,3400,,2.0


**Make sure to explicitly set the sort keyword argument**. It is not required, but Pandas recently changed the default value from `True` to `False`. Until the new versions are widely used, setting the keyword argument explicitly will help avoid confusion.

Notice that the `concat()` function combined the `DataFrames` using rows. By setting the axis keyword argument to **1**, you can combine on columns.

Suppose you have a new DataFrame with different columns but the same index as the all_city_data DataFrame.

In [40]:
city_countries = pd.DataFrame({
    'country': ['Holland', 'Japan', 'Holland', 'Canada', 'Spain'],
    'capital': [1, 1, 0, 0, 0]
}, index=['Amsterdam', 'Tokyo', 'Rotterdam', 'Toronto', 'Barcelona'])

Now you can call `concat()`, give it a list of the `DataFrames` to combine, and set the axis to **1** to add the new columns to the `DataFrame`.

In [42]:
cities = pd.concat([all_city_data, city_countries], axis=1, sort=False)
cities

Unnamed: 0,revenue,employee_count,employee,country,capital
Amsterdam,4200.0,5.0,,Holland,1.0
Tokyo,6500.0,8.0,,Japan,1.0
Toronto,5555.0,,,Canada,0.0
New York,7000.0,,2.0,,
Barcelona,3400.0,,2.0,Spain,0.0
Rotterdam,,,,Holland,0.0


 To eliminate `NaN` values, set the join keyword argument to `'inner'`. The inner join will only keep rows with indexes in both DataFrames.

In [43]:
cities = pd.concat([all_city_data, city_countries], axis=1, sort=False, join='inner')
cities

Unnamed: 0,revenue,employee_count,employee,country,capital
Amsterdam,4200,5.0,,Holland,1
Tokyo,6500,8.0,,Japan,1
Toronto,5555,,,Canada,0
Barcelona,3400,,2.0,Spain,0


This does not remove all `NaN` because the index is still in both `DataFrames`. By default, the join is an outer join, which includes all rows.

Using the `merge()` function, you can specify a column to merge on.

In [44]:
countries = pd.DataFrame({
    'population_millions': [17, 127, 37],
    'continent': ['Europe', 'Asia', 'North America']
}, index =['Holland', 'Japan', 'Canada'])

The `countries DataFrame` uses the country name as the index, but the `cities DataFrame` uses the country name as a column. With the `merge()` method, specify the column to merge on with the `left_on` keyword argument.

In [45]:
pd.merge(cities, countries, left_on='country', right_index=True)

Unnamed: 0,revenue,employee_count,employee,country,capital,population_millions,continent
Amsterdam,4200,5.0,,Holland,1,17,Europe
Tokyo,6500,8.0,,Japan,1,127,Asia
Toronto,5555,,,Canada,0,37,North America


 Notice also the `right_index` keyword argument is set to `True`. This means that the `DataFrame` on the **right side**, `countries`, will be joined on the **index**.

The return value includes countries that are present in both the `'country'` column in the `cities DataFrame` and the index of the `countries DataFrame`, and this is an inner join. 

The default is an inner join, but the `how` keyword argument can also use another join type. Here, the left join includes all rows in the cities `DataFrame`.

The country data will be added to those in which the index matches, with `NaN` for those who don’t. 

In [46]:
pd.merge(cities, countries, left_on='country', right_index=True, how='left') 

Unnamed: 0,revenue,employee_count,employee,country,capital,population_millions,continent
Amsterdam,4200,5.0,,Holland,1,17.0,Europe
Tokyo,6500,8.0,,Japan,1,127.0,Asia
Toronto,5555,,,Canada,0,37.0,North America
Barcelona,3400,,2.0,Spain,0,,
