# Scatter Plots

### Scatter plots are important in statistics because they can show the extent of correlation, if any, between the values of observed quantities or phenomena (called variables). If no correlation exists between the variables, the points appear randomly scattered on the coordinate plane

In [1]:
# import libraries
from matplotlib import pyplot as plt
%matplotlib notebook
from random import randint, uniform
from pandas import read_csv

In [2]:
# pick custom style
plt.style.use('seaborn')

In [3]:
# some random data
x = [randint(0, 10) for _ in range(20)]
y = [randint(0, 10) for _ in range(20)]

In [4]:
# colors: #2E8BCC, #C2CC2E
# c --> point color
# edgecolor --> color of circumference
# linewidth --> thickness of circumference
# s --> size of points
# aplha --> point see through
plt.scatter(x, y, c='#2E8BCC', edgecolor='#000000', linewidth=1, alpha=0.9, s=80)


plt.xlabel('x')
plt.ylabel('y')
plt.title('Scatter Plot')
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

#### Having the above statement in mind,  we can clearly see that there's no correlation between x and y variables

## Color map

In [5]:
# Each point needs to represent a weight in order to make sense for color map
# so we create a random list of 20 elements, and each element corresponds to each (x,y) pair
groups = [randint(0, 10) for _ in range(20)]

In [6]:
# cmap=plt.cm.Set1_r
# In this case, we pass groups list for the colors
# passing an argument to cmap parameter
plt.scatter(x, y, c=groups, cmap='summer', edgecolor='#000000', linewidth=1, alpha=0.9, s=80)

# then callling colobar function
cbar = plt.colorbar()
cbar.set_label('Satisfaction')


plt.xlabel('x')
plt.ylabel('y')
plt.title('Scatter Plot')
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

  cbar = plt.colorbar()


## Testing in real world data

In [7]:
data = read_csv('data.csv')
data.head()

Unnamed: 0,view_count,likes,ratio
0,8036001,324742,96.91
1,9378067,562589,98.19
2,2182066,273650,99.38
3,6525864,94698,96.25
4,9481284,582481,97.22


In [8]:
# These are 200 yt videos
view_count = data['view_count']
likes = data['likes']
like_dislike_ratio = data['ratio']

In [9]:
# likes per views scatter
# coloring is proportional of like/dislike ratio
plt.scatter(view_count, likes, c=like_dislike_ratio, edgecolor='#000000',
            linewidth=1, alpha=0.9, cmap='summer')

# calling colorbar
cbar = plt.colorbar()
cbar.set_label('like/dislike ratio')

# labeling
plt.title('Likes Per Views')
plt.xlabel('views')
plt.ylabel('likes')

# ploting in logarithmic scale to minimize diffs within outliers
plt.xscale('log')
plt.yscale('log')

plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

  cbar = plt.colorbar()


### We can easily see that, the more the views, the more the likes...which is intuitive.