In [3]:
import pandas as pd
import numpy as np
import lda
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

In [28]:
# Load the CF judgment CSV; the path is relative to the notebook's working directory.
df_cf_data = pd.read_csv(
    'Data/CF_Fatma_label_confidence_judgments_finalized_clean_text_28_05_2018.csv'
)

In [29]:
# Read the unlabeled tweets, then discard rows with any missing value and
# exact duplicate rows, in a single chained expression.
df_unlabeled_tweets = (
    pd.read_csv('Data/unlabeled_turkish_tweets_processd.csv')
    .dropna()
    .drop_duplicates()
)

In [30]:
# Restrict both frames to the text column and the violence-label column.
df_cf_data, df_unlabeled_tweets = (
    frame[['clean_text', 'Fatma_violence']]
    for frame in (df_cf_data, df_unlabeled_tweets)
)

# Stack the two frames row-wise.
# NOTE(review): the original row labels are kept, so index values may repeat
# in df_tweets -- confirm that no later cell relies on a unique index.
df_tweets = pd.concat([df_cf_data, df_unlabeled_tweets])

### SVD Labeled Data

In [41]:
# Topic-model settings
n_topics = 4   # number of topics
n_iter = 500   # number of iterations (not referenced by the cells shown here)

# Bag-of-words counts over the clean_text column of df_cf_data:
#   stop_words='english' removes English stop words
#   min_df=5 drops terms that appear in fewer than 5 documents
cvectorizer = CountVectorizer(stop_words='english', min_df=5)
cvz = cvectorizer.fit_transform(df_cf_data['clean_text'])

In [42]:
# Build a Latent Semantic Indexing (LSI) model: a truncated SVD of the
# document-term count matrix `cvz`.
#   n_components comes from `n_topics` so the topic count is configured in
#   exactly one place instead of being repeated as a literal.
#   random_state is pinned because TruncatedSVD's default solver is
#   randomized; without a seed the components can change between runs.
lsi_model = TruncatedSVD(n_components=n_topics, random_state=0)
lsi_Z = lsi_model.fit_transform(cvz)  # document-topic matrix
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(1214, 4)


In [43]:
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model
        Fitted model exposing ``components_``; iterating it yields one
        weight vector per topic, with one entry per vocabulary term.
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn releases, the legacy ``get_feature_names()``.
    top_n : int, default 10
        Number of terms to print per topic, in descending weight order.

    Returns
    -------
    None
        For each topic, a ``Topic <idx>:`` line followed by a list of
        ``(term, weight)`` tuples is written to stdout.
    """
    # Resolve the vocabulary once. The original looked it up again for every
    # printed term, rebuilding the full feature list each time, and relied on
    # an accessor that newer scikit-learn versions no longer provide.
    if hasattr(vectorizer, "get_feature_names_out"):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    # Plain Python strings keep the printed repr identical for both accessors.
    feature_names = [str(name) for name in names]

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice takes the last top_n
        # indices, i.e. the largest weights first.
        print([(feature_names[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])

In [44]:
# Report the top terms of each LSI topic, closed off by a separator rule.
print("LSI Model:")
print_topics(model=lsi_model, vectorizer=cvectorizer)
print("=" * 20)

LSI Model:
Topic 0:
[('police', 0.8718562469979186), ('square', 0.2171728374096674), ('gas', 0.18810026703033117), ('people', 0.15608124809970003), ('tear', 0.15068607490458125), ('erdogan', 0.11556924004007604), ('riot', 0.09498163735892766), ('ankara', 0.08247301914043668), ('violence', 0.07844153044830099), ('attack', 0.0744780137704591)]
Topic 1:
[('erdogan', 0.8920633089673118), ('says', 0.175582091957604), ('people', 0.16463127373564582), ('minister', 0.12261595810007266), ('prime', 0.10193634253516726), ('media', 0.09693102467495528), ('world', 0.08141696379309585), ('news', 0.06526952441576764), ('opposition', 0.05843772471424674), ('united', 0.050931761430265676)]
Topic 2:
[('need', 0.5408581648891233), ('understand', 0.5199191929407079), ('things', 0.5196089446418974), ('violence', 0.1081985882773863), ('want', 0.10407347116717146), ('stop', 0.1018786148411038), ('good', 0.0986287209253838), ('burgers', 0.09084919767210617), ('nice', 0.08977010054938288), ('better', 0.0894878

In [None]:
# Configure a t-SNE model that embeds into 2 dimensions.
#   angle: a value close to 1 sacrifices accuracy for speed
#   init='pca': PCA initialization usually leads to better results
#   random_state=0: fixed seed
tsne_model = TSNE(
    n_components=2,
    init='pca',
    angle=.99,
    random_state=0,
    verbose=1,
)

# Project the LSI document-topic matrix (one column per topic) down to 2-D.
# The result keeps the name `tsne_lda` even though the input is the LSI output.
tsne_lda = tsne_model.fit_transform(lsi_Z)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1214 samples in 0.001s...
[t-SNE] Computed neighbors for 1214 samples in 0.048s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1214
[t-SNE] Computed conditional probabilities for sample 1214 / 1214
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.380810


In [36]:
# How many top-weighted keywords are shown per topic
n_top_words = 5

# Palette of 5 hex colors (presumably indexed by topic id -- confirm in the plotting cell)
colormap = np.array([
    "#1f77b4",
    "#FF0000",
    "#6AA84F",
    "#E69138",
    "#A64D79",
])

In [37]:
# Dominant topic per document: the column index of the largest value in each
# row of lsi_Z. A single vectorized argmax replaces the per-row Python loop
# and the easy-to-misread `+= value,` tuple-concatenation idiom; the result
# is still a plain list with one integer index per document.
_lda_keys = list(lsi_Z.argmax(axis=1))

In [38]:
# Build one space-separated keyword summary per topic.
topic_word = lsi_model.components_  # one weight vector per topic

# Prefer the current vocabulary accessor; fall back to the legacy one on
# older scikit-learn releases. `vocab` stays a plain list either way.
if hasattr(cvectorizer, "get_feature_names_out"):
    vocab = list(cvectorizer.get_feature_names_out())
else:
    vocab = cvectorizer.get_feature_names()

# Convert the vocabulary to an array once, not once per topic.
_vocab_arr = np.array(vocab)

topic_summaries = []
for topic_dist in topic_word:
    # argsort is ascending; the reversed slice picks the n_top_words largest
    # weights, highest first. Slicing the indices before the vocabulary
    # lookup avoids reordering the whole vocabulary for every topic.
    _top_idx = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
    topic_words = _vocab_arr[_top_idx]
    topic_summaries.append(' '.join(topic_words))

In [39]:
# Keep only the documents whose strongest topic weight exceeds the cut-off.
threshold = 0.5
_idx = lsi_Z.max(axis=1) > threshold  # boolean mask, one entry per document
X_topics = lsi_Z[_idx]

In [40]:
# Display the per-topic keyword summaries as this cell's output.
# NOTE(review): the saved output lists two summaries; re-run the notebook top
# to bottom to confirm it matches the current model configuration.
topic_summaries

['police square gas people tear', 'erdogan says people minister prime']