In [1]:
import pandas as pd
import numpy as np

### Big Countries

In [2]:
# Column-oriented sample data for the Big Countries exercise:
# one key per column, one list entry per country.
world_dict = dict(
    name=['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola'],
    continent=['Asia', 'Europe', 'Africa', 'Europe', 'Africa'],
    area=[652_230, 28_748, 2_381_741, 468, 1_246_700],
    population=[25_500_100, 2_831_741, 37_100_000, 78_115, 20_609_294],
    gdp=[
        20_343_000_000,
        12_960_000_000,
        188_681_000_000,
        3_712_000_000,
        100_990_000_000,
    ],
)

In [3]:
# Build the World table; every key of world_dict becomes a column.
world = pd.DataFrame(data=world_dict)

In [4]:
# Print a structural summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
world.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        5 non-null      object
 1   continent   5 non-null      object
 2   area        5 non-null      int64 
 3   population  5 non-null      int64 
 4   gdp         5 non-null      int64 
dtypes: int64(3), object(2)
memory usage: 328.0+ bytes


Problem Statement

A country is big if:

it has an area of at least three million (i.e., 3000000 km²), or
it has a population of at least twenty-five million (i.e., 25000000).
Write a solution to find the name, population, and area of the big countries.

Return the result table in any order.

In [5]:
# Variant 1: select the qualifying rows and the reported columns in one
# .loc call. A country qualifies when either threshold is met.
world.loc[
    lambda df: (df['area'] >= 3000000) | (df['population'] >= 25000000),
    ['name', 'area', 'population'],
]

Unnamed: 0,name,area,population
0,Afghanistan,652230,25500100
2,Algeria,2381741,37100000


In [6]:
# Variant 2: project the reported columns first, then keep the rows whose
# area or population reaches the threshold (mask computed on `world`).
world[['name', 'area', 'population']].loc[
    (world['area'] >= 3000000) | (world['population'] >= 25000000)
]

Unnamed: 0,name,area,population
0,Afghanistan,652230,25500100
2,Algeria,2381741,37100000


In [7]:
# Boolean mask: True where a country meets the area OR the population threshold.
condition = world['area'].ge(3000000) | world['population'].ge(25000000)

In [8]:
# Unlike boolean indexing, `where` keeps the frame's shape: rows that fail
# the condition are blanked out (NaN) instead of being dropped, and the
# numeric columns are displayed as floats as a result.
world.where(cond=condition)

Unnamed: 0,name,continent,area,population,gdp
0,Afghanistan,Asia,652230.0,25500100.0,20343000000.0
1,,,,,
2,Algeria,Africa,2381741.0,37100000.0,188681000000.0
3,,,,,
4,,,,,


### Recyclable and Low Fat Products

In [9]:
# Column-oriented sample data for the Products table; the two flag
# columns hold 'Y' / 'N' strings.
Products_dict = dict(
    product_id=[0, 1, 2, 3, 4],
    low_fats=['Y', 'Y', 'N', 'Y', 'N'],
    recyclable=['N', 'Y', 'Y', 'Y', 'N'],
)

In [10]:
# Build the Products table from the column-oriented dict, then display it.
Products = pd.DataFrame(data=Products_dict)
Products

Unnamed: 0,product_id,low_fats,recyclable
0,0,Y,N
1,1,Y,Y
2,2,N,Y
3,3,Y,Y
4,4,N,N


Write a solution to find the ids of products that are both low fat and recyclable.

In [11]:
# Variant 1: rows flagged 'Y' for both low_fats and recyclable,
# reduced to the product_id column in a single .loc call.
Products.loc[
    Products['low_fats'].eq('Y') & Products['recyclable'].eq('Y'),
    ['product_id'],
]

Unnamed: 0,product_id
1,1
3,3


In [12]:
# Variant 2: project product_id first, then apply the same two-flag mask
# (computed on the full Products frame).
Products[['product_id']].loc[
    (Products['low_fats'] == 'Y') & (Products['recyclable'] == 'Y')
]

Unnamed: 0,product_id
1,1
3,3


### Customers who never order

In [13]:
# Sample customers: one id and one name per customer.
customers_dict = dict(
    id=[1, 2, 3, 4],
    name=['joe', 'henry', 'sam', 'max'],
)

# Sample orders: customerId refers to an id in customers_dict.
orders_dict = dict(
    id=[1, 3],
    customerId=[2, 1],
)

In [14]:
# Materialise both tables, then print them one after the other
# (customers first, orders second) as plain text.
customers = pd.DataFrame(data=customers_dict)
orders = pd.DataFrame(data=orders_dict)

print(customers, orders, sep='\n')

   id   name
0   1    joe
1   2  henry
2   3    sam
3   4    max
   id  customerId
0   1           2
1   3           1


Write a solution to find all customers who never order anything.

In [15]:
# Right join keeps every customer; customers with no matching order get
# NaN in the order-side columns. The clashing `id` columns are
# disambiguated with the _orders / _customer suffixes.
merged = orders.merge(
    customers,
    how='right',
    left_on='customerId',
    right_on='id',
    suffixes=('_orders', '_customer'),
)
merged

Unnamed: 0,id_orders,customerId,id_customer,name
0,3.0,1.0,1,joe
1,1.0,2.0,2,henry
2,,,3,sam
3,,,4,max


In [16]:
# A missing customerId means the right join found no order for that
# customer; report those customers' ids.
merged.loc[merged['customerId'].isna(), ['id_customer']]

Unnamed: 0,id_customer
2,3
3,4


Approach - 2

In [17]:
# Same right join as before, with indicator=True adding a `_merge` column
# that records whether each row matched ('both') or exists only on the
# customers side ('right_only').
merged = orders.merge(
    customers,
    how='right',
    left_on='customerId',
    right_on='id',
    suffixes=('_orders', '_customer'),
    indicator=True,
)
merged

Unnamed: 0,id_orders,customerId,id_customer,name,_merge
0,3.0,1.0,1,joe,both
1,1.0,2.0,2,henry,both
2,,,3,sam,right_only
3,,,4,max,right_only


In [18]:
# Customers present only on the right side of the join never ordered;
# return their names under the column label 'Customers'.
(
    merged
    .loc[merged['_merge'] == 'right_only', ['name']]
    .rename(columns={'name': 'Customers'})
)

Unnamed: 0,Customers
2,sam
3,max


### Article Views 1

Methods used to solve the problem.

- DataFrame
- unique
- sort_values

In [19]:
# Column-oriented sample data for the Views table; one list entry per
# recorded view, with the date kept as a 'YYYY-MM-DD' string.
views_dict = dict(
    article_id=[1, 1, 2, 2, 4, 3, 3],
    author_id=[3, 3, 7, 7, 7, 4, 4],
    viewer_id=[5, 6, 7, 6, 1, 4, 4],
    view_date=[
        '2019-08-01',
        '2019-08-02',
        '2019-08-01',
        '2019-08-02',
        '2019-07-22',
        '2019-07-21',
        '2019-07-21',
    ],
)

In [20]:
# Build the Views table; every key of views_dict becomes a column.
views = pd.DataFrame(data=views_dict)

In [21]:
# Display the full Views table.
views

Unnamed: 0,article_id,author_id,viewer_id,view_date
0,1,3,5,2019-08-01
1,1,3,6,2019-08-02
2,2,7,7,2019-08-01
3,2,7,6,2019-08-02
4,4,7,1,2019-07-22
5,3,4,4,2019-07-21
6,3,4,4,2019-07-21


Write a solution to find all the authors that viewed at least one of their own articles.

Return the result table sorted by id in ascending order.

In [22]:
# Keep the rows where the viewer is the article's own author, take the
# author_id column, and collapse it to its distinct values.
filtered_views = views.loc[
    views['author_id'] == views['viewer_id'],
    'author_id',
].unique()

filtered_views  # .unique() returns a NumPy array, not a Series

array([7, 4], dtype=int64)

In [23]:
# Wrap the array of author ids (filtered_views) in a one-column DataFrame
# named 'id', then show it sorted in ascending order of id.
filtered_authors = pd.DataFrame({'id': filtered_views})

filtered_authors.sort_values('id')

Unnamed: 0,id
1,4
0,7
