### Import all necessary files

In [1]:
import matplotlib.pyplot as plt
from tweets import *
from evaluation import *

# File names of previously saved runs; one entry is picked by index and
# handed to read_computed_data in the "Read earlier computed data" cell.
precomputedDataToRead = [
    'dbscan_11x50.data',
    'tidbscan_11x50.data',
    'tidbscan_11xall.data',
    'teee.data',
]

# Selector index -> algorithm name. The name is the key used to look up the
# clustering runner in the "Group vectors" cell.
ALGORYTHMS = dict(enumerate(
    ['dbscan', 'dbscan_sklearn', 'tidbscan', 'swdbscan', 'sw_tidbscan']
))

# Index into ALGORYTHMS of the algorithm this notebook run uses.
chosen_algorythm = 0

[ 0  0  0  1  1  1  1  2  2  2  2 -1  2  1  1  2  2  2  0]
DBSCAN(eps=4, min_samples=3)


### 0 Read earlier computed data

In [None]:
# Load a previously computed dataset, then skip ahead to section "4 Results".
# Index 3 selects the last entry of precomputedDataToRead.
d = read_computed_data(precomputedDataToRead[3])
(tweets, class_vector, tweet_vector, results,
 epsilon_start, epsilon_end, n, distances) = d

### 1 Get vectors from table of tweets

In [2]:
# Read tweets from DATA (second argument: number of tweets per account)
# and convert them into a NumPy array of vectors.
# NOTE(review): the earlier comment here said "DBSCAN set True, TIDBSCAN set
# False", but no boolean flag is passed in this cell - confirm whether such a
# switch is still needed.
tweets, class_vector = read_n_tweets_from_data(
    DATA,
    50,
)
tweet_vector = np.array(get_vectors(tweets))

### 2 Find out epsilon
Choose epsilon by looking at the distribution of the minimum, median and maximum Euclidean distances between all points.

In [3]:
%%time
# Placeholder value: the distance computation below is disabled, but `distances`
# is one of the fields unpacked when a saved run is loaded, so the name is kept
# defined here as well.
distances = None
# To inspect the distance distribution, re-enable the lines below (either
# compute the distances or read a previously computed list, then plot).
# distances = create_and_save_distances_list(tweet_vector, save=False) # Compute distances
# # distances = read_distances_list() # Read previously computed distances
# minimum, median, maximum = get_basic_statistics(distances)
# plot_basic_statistics(minimum, median, maximum)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.63 µs


### 3 Group vectors for multiple epsilon
After analysing the distribution of distances between points,
run the clustering `n` times for epsilon values taken from `epsilon_start` to `epsilon_end`.

In [4]:
%%time
# Sweep parameters handed to every clustering runner below.
# NOTE(review): presumably n is the number of epsilon values tried between
# epsilon_start and epsilon_end - confirm against get_epsilon_array.
n = 1 #15
epsilon_start = 0.65
epsilon_end = 1.2

# Zero-argument runners keyed by algorithm name (the values of ALGORYTHMS).
# An entry of None marks an algorithm that is listed but not wired up yet.
switchAlgorytm = {
  'dbscan': lambda : dbscan(tweet_vector, epsilon_start, epsilon_end, n),
  'dbscan_sklearn': lambda : dbscan_sklearn(tweet_vector, epsilon_start, epsilon_end, n),
  'tidbscan': lambda : tidbscan(tweet_vector, epsilon_start, epsilon_end, n),
  'swdbscan': lambda : swdbscan(tweet_vector, epsilon_start, epsilon_end, n),
  'sw_tidbscan': None
}

algorithm_name = ALGORYTHMS[chosen_algorythm]
run_clustering = switchAlgorytm[algorithm_name]
if run_clustering is None:
    # Fail with a readable message instead of "'NoneType' object is not callable".
    raise NotImplementedError(
        "Algorithm '{}' (chosen_algorythm={}) is listed in ALGORYTHMS but has no "
        "runner in switchAlgorytm".format(algorithm_name, chosen_algorythm)
    )

results = run_clustering()

CPU times: user 47.1 s, sys: 123 ms, total: 47.2 s
Wall time: 47.4 s


## 4.0 Saving data

In [None]:
# Optional, disabled by default: persist the current run so it can be reloaded
# later. The list order matches the order in which the "Read earlier computed
# data" cell unpacks a loaded run.
# NOTE(review): this saves to 'dump/teee.data' while precomputedDataToRead
# lists 'teee.data' - confirm that read_computed_data resolves the 'dump/' prefix.
# NOTE(review): the recursion limit is raised before saving, presumably because
# serialising the results recurses deeply - confirm the value 3895 is still needed.
# dataToSave = [tweets, class_vector, tweet_vector, results, epsilon_start, epsilon_end, n, distances]  
# import sys
# sys.setrecursionlimit(3895)
# save_data(dataToSave, 'dump/teee.data')

### 4 Results

Create a dataframe from the results so that they can be visualised.


In [5]:
%%time
# Build the results dataframe with the helper that matches the implementation
# that produced `results`: indices 0 and 2 ('dbscan', 'tidbscan') share one
# helper, index 1 ('dbscan_sklearn') uses the other.
# NOTE(review): any other chosen_algorythm value leaves `df` untouched.
if chosen_algorythm in {0, 2}:
    df = get_df_for_results(results, class_vector, epsilon_start, epsilon_end, n)
elif chosen_algorythm == 1:
    df = get_df_for_test_results(results, class_vector, epsilon_start, epsilon_end, n)

CPU times: user 25.1 ms, sys: 1.02 ms, total: 26.1 ms
Wall time: 26.8 ms


### Visualisation
Plot all clusters for the different epsilon values.

%%time
ax = df.plot.hist(bins=60, alpha=0.5, legend=False)  # histogram of df's columns: 60 bins, semi-transparent, legend hidden
ax.plot()  # NOTE(review): called without data, so it appears to draw nothing - confirm and remove
plt.show()

In [6]:
%%time
# Disabled alternative view: df.hist on an n x 11 grid of subplots.
#a = df.hist(bins=60, alpha=1, legend=False, figsize=(60,n*4), layout=(n,11))

CPU times: user 7 µs, sys: 1e+03 ns, total: 8 µs
Wall time: 15.3 µs


### Clustering evaluation


#### Silhouette Score

In [9]:
# Silhouette score via the xdbscan scorer, regardless of chosen_algorythm.
# NOTE(review): the following cell already selects the scorer by
# chosen_algorythm (the sklearn variant also takes tweet_vector), and this cell
# was executed after it (In [9] vs In [8]). It looks like a leftover - confirm
# and remove.
silhouette = get_silhouette_score_xdbscan(results)

In [8]:
# Pick the silhouette scorer that matches the implementation that produced
# `results`: indices 0 and 2 ('dbscan', 'tidbscan') are scored from `results`
# alone, index 1 ('dbscan_sklearn') additionally needs the tweet vectors.
# NOTE(review): any other chosen_algorythm value leaves `silhouette` untouched.
if chosen_algorythm in {0, 2}:
    silhouette = get_silhouette_score_xdbscan(results)
elif chosen_algorythm == 1:
    silhouette = get_silhouette_score_dbscan_sklearn(results, tweet_vector)

In [10]:
# Display the silhouette score(s) computed above.
silhouette

[-0.036604535]

#### Purity Score

In [None]:
# Purity score(s) of the clustering results; the scorer depends on which
# implementation produced `results`.
if chosen_algorythm == 1:
    sc = get_purity_for_dbscan_sklearn(class_vector, results, len(results))
elif chosen_algorythm in {0, 2}:
    sc = get_purity_for_xdbscan(class_vector, results, len(results))
else:
    # Without this branch `sc` would be undefined (NameError below) or, worse,
    # a stale value from an earlier run would be plotted as if it were current.
    raise NotImplementedError(
        "Purity score is not implemented for algorithm '{}' (chosen_algorythm={})".format(
            ALGORYTHMS.get(chosen_algorythm, 'unknown'), chosen_algorythm
        )
    )

# Epsilon values the scores are plotted against.
epsilons = get_epsilon_array(epsilon_start, epsilon_end, n)

# Purity vs. epsilon as red dots, on fixed axes: epsilon 0.1-2, purity 0-1.
plt.plot(epsilons, sc, 'ro')
plt.axis([0.1, 2, 0, 1])
plt.xlabel("Epsilon")
plt.ylabel("Purity")
plt.show()

# Report the best score and the epsilon at the same position.
# sc.index returns the first position when several scores tie for the maximum.
max_purity = max(sc)
best_purity_index = sc.index(max_purity)
eps_for_max_purity = epsilons[best_purity_index]
print("Max purity score is P={}, for epsilon={}. ".format(round(max_purity,3), eps_for_max_purity))

In [None]:
# Collect the group details for the clustering result with the highest purity
# (first index at which `sc` equals `max_purity`) and unpack the six fields.
gr = get_all_groups_in_result(results, sc.index(max_purity), tweets)
(group_id, number_of_points, indexes_of_points,
 list_of_tweets, stats, points) = gr

In [None]:
# Overview of the groups: number of points per group as bars, with the
# min / median / max sentence length of each group drawn as markers.
fig, ax = plt.subplots()

# Each artist carries its own label, so the legend no longer depends on the
# order in which matplotlib happens to collect lines and bar containers.
ax.plot(group_id, stats["med_sent_length"], '*', alpha=0.5, label="Mediana długości zdania")
ax.bar(group_id, number_of_points, alpha=0.5, label="Histogram")
ax.plot(group_id, stats["min_sent_length"], 'v', alpha=0.5, label="Min długość zdania")
ax.plot(group_id, stats["max_sent_length"], '^', alpha=0.5, label="Max długość zdania")
ax.set_xlabel("Id grupy")
ax.set_ylabel("Liczność")
fig.legend()
# NOTE(review): 'filename.png' looks like a placeholder and is overwritten on
# every run; dpi=900 produces a very large image - confirm both are intended.
fig.savefig('filename.png', dpi=900)
plt.show()

In [None]:
# Print the tweets of every group under a bold header.
# Group numbers are reported as index - 1, so the first entry is group_nr=-1
# (presumably the DBSCAN noise label - confirm).
for num in range(len(list_of_tweets)):
    group_tweets = list_of_tweets[num]
    # '\033[1m' and '\033[0m' are the ANSI escape codes for bold on / reset.
    print('\033[1m' + "Here is list of {} tweets for group_nr={}".format(len(group_tweets),num-1)+'\033[0m')
    # Plain loop: no throwaway list is built just for print's side effect.
    for tw in group_tweets:
        print(tw + "\n")
    print("\n\n\n")
