In [134]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.semi_supervised import label_propagation
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [135]:
def shuffle(df, n=1, axis=0):     
    df = df.copy()
    for _ in range(n):
        df.apply(np.random.shuffle, axis=axis)
    return df

In [321]:
df_cf_labeled = pd.read_csv('Data/CF_Fatma_label_confidence_judgments_finalized_clean_text_28_05_2018.csv')

In [322]:
df_cf_labeled.columns

Index([u'Unnamed: 0', u'_unit_id', u'_golden', u'_unit_state_x',
       u'_trusted_judgments', u'_last_judgment_at_x', u'violence_judgment',
       u'violence', u'violence_confidence', u'protest_judgment_x', u'protest',
       u'protest_confidence', u'created_at', u'id', u'proccd_text',
       u'Fatma_protest', u'Fatma_violence', u'clean_text'],
      dtype='object')

In [323]:
df_unlabeled = pd.read_csv('Data/unlabeled_turkish_tweets_processd.csv')

In [324]:
df_cf_labeled = df_cf_labeled[['clean_text', 'Fatma_violence']]

In [325]:
len(df_cf_labeled)

1214

In [326]:
df_unlabeled = df_unlabeled[['clean_text', 'Fatma_violence']]

In [327]:
len(df_unlabeled)

91028

In [328]:
df_tweets = pd.concat([df_cf_labeled, df_unlabeled])

In [329]:
df_tweets.columns

Index([u'clean_text', u'Fatma_violence'], dtype='object')

In [330]:
df_tweets.shape

(92242, 2)

In [331]:
df_tweets = df_tweets.dropna()
df_tweets = df_tweets.drop_duplicates()

In [332]:
df_tweets.shape

(63159, 2)

In [333]:
rng = np.random.RandomState(0)

In [334]:
X = df_tweets.clean_text
y = df_tweets.Fatma_violence

In [335]:
X = X[:2214]
y = y[:2214]

In [336]:
BOW = CountVectorizer(ngram_range=(1, 2),stop_words='english',  min_df=3)
X = BOW.fit_transform(X)

In [337]:
X.shape

(2214, 1466)

In [338]:
y.shape

(2214,)

In [339]:
pca = TruncatedSVD(n_components=2, random_state=42)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['pca1', 'pca2'])

In [340]:
ls100_prcnt_unlabeled = (label_propagation.LabelSpreading().fit(principalDf, y), y)

In [341]:
ls50_prcnt_unlabeled = (label_propagation.LabelSpreading().fit(principalDf[:1714], y[:1714]), y[:1714])

In [342]:
ls100_prcny_labeled = (label_propagation.LabelSpreading().fit(principalDf[:1214], y[:1214]), y[:1214])

In [343]:
rbf_svc = (svm.SVC(kernel='linear').fit(principalDf[:1214], y[:1214]), y[:1214])

In [344]:
# step size in the mesh
h = .02
# create a mesh to plot in
x_min, x_max = principalDf.pca1.min() - 1, principalDf.pca1.max() + 1
y_min, y_max = principalDf.pca2.min() - 1, principalDf.pca2.max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

In [345]:
# title for the plots
titles = ['Label Spreading 1000 unlabeled data - 1214 labeled',
          'Label Spreading 500 unlabeled data- 1214 labeled',
          'Label Spreading 0 unlabeled data- 1214 labeled',
          'SVC with rbf kernel - 1214 labeled']

In [346]:
color_map = {-1: (1, 1, 1), 0: (0, 0, .9), 1: (1, 0, 0), 2: (.8, .6, 0)}

In [347]:
for i, (clf, y_train) in enumerate((ls100_prcnt_unlabeled,ls50_prcnt_unlabeled, ls100_prcny_labeled, rbf_svc)):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    plt.subplot(2, 2, i + 1)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.axis('off')

    # Plot also the training points
    colors = [color_map[y] for y in y_train]
    plt.scatter(principalDf.pca1, principalDf.pca2, c=colors, edgecolors='black')

    plt.title(titles[i])
    
plt.suptitle("Unlabeled points are colored white", y=0.1)
plt.show()    

TypeError: 'LabelSpreading' object is not iterable

In [317]:
principalDf.shape

(2214, 2)