In [None]:
import pandas as pd
import numpy as np
from numpy import matlib as mb

In [None]:
# Year under analysis; it is interpolated into the input and output CSV file names.
year = 2018

In [None]:
# Load the per-dataset hit counts for the selected year.
hits_csv_path = 'out/datasets-hits-%s.csv' % year
df = pd.read_csv(hits_csv_path)

In [None]:
# Summary statistics of the hit counts (displayed as the cell output).
nb_hits_summary = df['nb_hits'].describe()
nb_hits_summary

In [None]:
# source: https://dataplatform.cloud.ibm.com/analytics/notebooks/54d79c2a-f155-40ec-93ec-ed05b58afa39/view?access_token=6d8ec910cf2a1b3901c721fcb94638563cd646fe14400fecbb76cea6aaae2fb1

def find_knee_index(values):
    """Return the index of the knee/elbow of a curve.

    The curve is the sequence of points ``(i, values[i])``. The knee is the
    point with the largest perpendicular distance to the straight line that
    joins the first and the last point of the curve.

    Parameters
    ----------
    values : sequence of numbers
        Ordinates of the curve, in the order they should be traversed.

    Returns
    -------
    int
        Position in ``values`` of the knee (``0`` for a single point).

    Raises
    ------
    ValueError
        If ``values`` is empty.
    """
    # Work in float64: with integer input the squared differences below
    # would silently wrap around in int64 for large values.
    ordinates = np.asarray(values, dtype=float)
    n_points = len(ordinates)
    if n_points == 0:
        raise ValueError("cannot locate the knee of an empty curve")
    if n_points == 1:
        # A single point has no line to measure against; it is the knee.
        return 0

    # coordinates of all the points: (position, value)
    coords = np.column_stack((np.arange(n_points, dtype=float), ordinates))

    # vector between first and last point - this is the line.
    # Its x component is n_points - 1 >= 1, so the norm is never zero here.
    first_point = coords[0]
    line_vec = coords[-1] - first_point
    line_unit = line_vec / np.sqrt(np.sum(line_vec ** 2))

    # vector between every point and the first point
    vec_from_first = coords - first_point

    # Split vec_from_first into a component parallel to the line and one
    # perpendicular to it. The scalar product with the unit vector gives the
    # length of the projection onto the line (line_unit broadcasts across
    # rows); multiplying that length by the unit vector gives the parallel
    # component, and the remainder is the perpendicular one.
    projection_length = np.sum(vec_from_first * line_unit, axis=1)
    vec_parallel = np.outer(projection_length, line_unit)
    vec_to_line = vec_from_first - vec_parallel

    # distance to the line is the norm of the perpendicular component
    dist_to_line = np.sqrt(np.sum(vec_to_line ** 2, axis=1))

    # knee/elbow is the point with max distance value
    return int(np.argmax(dist_to_line))


# pull out the list from pandas frame
values = list(df['nb_hits'])

idxOfBestPoint = find_knee_index(values)

print("Knee of the curve is at index =",idxOfBestPoint)
print("Knee value =", values[idxOfBestPoint])



In [None]:
# Rows before the knee index (the knee row itself is excluded) go to the "top" file.
top_rows = df.iloc[:idxOfBestPoint]
top_rows.to_csv("out/top-%s.csv" % year)

In [None]:
# Rows from the knee index onwards (knee row included) go to the "bottom" file.
bottom_rows = df.iloc[idxOfBestPoint:]
bottom_rows.to_csv("out/bottom-%s.csv" % year)