In [28]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.semi_supervised import label_propagation
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [51]:
# Location of the labelled dataset, relative to the notebook's working directory.
CF_LABELS_CSV = 'Data/CF_Fatma_label_confidence_judgments_finalized_clean_text_28_05_2018.csv'
df_cf = pd.read_csv(CF_LABELS_CSV)

In [52]:
# Show the column index of the loaded frame.
df_cf.columns

Index([u'Unnamed: 0', u'_unit_id', u'_golden', u'_unit_state_x',
       u'_trusted_judgments', u'_last_judgment_at_x', u'violence_judgment',
       u'violence', u'violence_confidence', u'protest_judgment_x', u'protest',
       u'protest_confidence', u'created_at', u'id', u'proccd_text',
       u'Fatma_protest', u'Fatma_violence', u'clean_text'],
      dtype='object')

In [53]:
# Seeded generator used below to decide which labels get hidden, so the
# masking is repeatable when the notebook is run top to bottom.
rng = np.random.RandomState(seed=0)

In [54]:
# Input documents and the target column used for every model below.
X = df_cf['clean_text']
y = df_cf['Fatma_violence']

In [55]:
# Bag-of-words features: unigrams and bigrams, English stop words removed,
# keeping only terms that appear in at least 3 documents.
BOW = CountVectorizer(ngram_range=(1, 2), stop_words='english', min_df=3)
# Vectorize straight from the source column instead of from `X`. The result is
# stored back into `X`, so reading `X` here would make this cell non-idempotent:
# re-running it alone would feed the sparse matrix back into fit_transform.
X = BOW.fit_transform(df_cf.clean_text)

In [56]:
# (documents, vocabulary terms) of the bag-of-words matrix.
X.shape

(1214, 775)

In [57]:
# Number of labels; its length should match the row count of X.
y.shape

(1214,)

In [58]:
# Reduce the bag-of-words matrix to two components with truncated SVD so the
# classifiers can be fitted and drawn in a plane. The variable is called `pca`
# but the estimator is TruncatedSVD.
pca = TruncatedSVD(n_components=2, random_state=42)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(principalComponents, columns=['pca1', 'pca2'])

In [59]:
# Copy the labels and mark a random ~30% of them as unlabeled (-1).
hidden_30 = rng.rand(len(y)) < 0.3
y_30 = np.copy(y)
y_30[hidden_30] = -1

In [60]:
# Copy the labels and mark a random ~50% of them as unlabeled (-1).
hidden_50 = rng.rand(len(y)) < 0.5
y_50 = np.copy(y)
y_50[hidden_50] = -1

In [61]:
# Fit one LabelSpreading model per label set (30% hidden, 50% hidden, fully
# labelled). Each result is a (fitted model, labels used for fitting) pair so
# the plotting code can colour points by the labels the model actually saw.
ls30, ls50, ls100 = (
    (label_propagation.LabelSpreading().fit(principalDf, labels), labels)
    for labels in (y_30, y_50, y)
)

In [62]:
# Fully supervised baseline, stored as a (fitted model, labels) pair.
# NOTE(review): despite the name `rbf_svc`, this estimator is created with
# kernel='linear' -- confirm which kernel is intended.
rbf_svc = (svm.SVC(kernel='linear').fit(principalDf, y), y)

In [63]:
# Spacing between neighbouring nodes of the evaluation grid.
h = .02
# Bounding box of the projected data, padded by one unit on every side.
x_min = principalDf.pca1.min() - 1
x_max = principalDf.pca1.max() + 1
y_min = principalDf.pca2.min() - 1
y_max = principalDf.pca2.max() + 1
# Coordinate matrices covering the padded box at resolution h.
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

In [64]:
# Lower mesh bound along pca1 (data minimum minus 1).
x_min

-1.0

In [65]:
# Upper mesh bound along pca1 (data maximum plus 1).
x_max

4.4965238690130871

In [66]:
# Lower mesh bound along pca2 (data minimum minus 1).
y_min

-1.7098372637443888

In [67]:
# Upper mesh bound along pca2 (data maximum plus 1).
y_max

5.6684535740103987

In [68]:
# Grid shape: rows span the pca2 range, columns span the pca1 range.
xx.shape

(369, 275)

In [69]:
# Same shape as xx; the two arrays hold the coordinates of each grid node.
yy.shape

(369, 275)

In [70]:
# Panel titles, in the same order as the classifiers passed to the plotting loop.
# The last entry describes the SVC baseline, which is fitted with
# kernel='linear'; the previous text ("rbf kernel") mislabelled the figure.
titles = ['Label Spreading 30% data',
          'Label Spreading 50% data',
          'Label Spreading 100% data',
          'SVC with linear kernel']

In [71]:
# RGB colour for each label value when drawing the training points.
color_map = {
    -1: (1, 1, 1),    # unlabeled -> white
    0: (0, 0, .9),    # class 0 -> blue
    1: (1, 0, 0),     # class 1 -> red
    2: (.8, .6, 0),   # class 2 -> ochre
}

In [72]:
# Draw one panel per classifier: the predicted class over the whole mesh as a
# filled contour, with the training points on top coloured by the labels that
# classifier was fitted on.
for i, (clf, y_train) in enumerate((ls30, ls50, ls100, rbf_svc)):
    # Panels are laid out on a 2x2 grid; subplot indices start at 1.
    plt.subplot(2, 2, i + 1)

    # Predict a class for every mesh node, then fold the flat prediction
    # vector back into the grid shape expected by contourf.
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.axis('off')

    # Colour each training point by its label. The comprehension variable is
    # deliberately NOT called `y`: under Python 2 a list-comprehension variable
    # leaks into the enclosing scope and would overwrite the notebook's label
    # Series `y`, breaking any later re-run of the model-fitting cells.
    colors = [color_map[label] for label in y_train]
    plt.scatter(principalDf.pca1, principalDf.pca2, c=colors, edgecolors='black')

    plt.title(titles[i])

plt.suptitle("Unlabeled points are colored white", y=0.1)
plt.show()