In [1]:
import numpy as np
import pandas as pd

In [2]:
# Reload `X` and `y` from IPython's %store database (`-r` = restore).
# NOTE(review): this only works if they were saved earlier with `%store X` /
# `%store y` — confirm which notebook/session is responsible for that.
%store -r X
%store -r y

In [3]:
# Show the restored `X`; pandas prints a truncated repr for a long Series.
X

0       .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1       @jessedee Know about @fludapp ? Awesome iPad/i...
2       @swonderlin Can not wait for #iPad 2 also. The...
3       @sxsw I hope this year's festival isn't as cra...
4       @sxtxstate great stuff on Fri #SXSW: Marissa M...
                              ...                        
9088                        Ipad everywhere. #SXSW {link}
9089    Wave, buzz... RT @mention We interrupt your re...
9090    Google's Zeiger, a physician never reported po...
9091    Some Verizon iPhone customers complained their...
9092    Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...
Name: text, Length: 8909, dtype: object

In [4]:
# Show the restored `y`; pandas prints a truncated repr for a long Series.
y

0       0
1       2
2       2
3       0
4       2
       ..
9088    2
9089    1
9090    1
9091    1
9092    1
Name: target, Length: 8909, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

# Split once with a fixed seed so the partition is repeatable; `stratify=y`
# keeps the class proportions of `y` the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=112221,
    stratify=y,
)

In [6]:
import regex as re
import string
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

# NLTK's English stopword list plus a handful of extra tokens that should be
# dropped from every document, built in a single expression.
stopwords = nltk.corpus.stopwords.words('english') + [
    'sxsw',
    'sxswi',
    'link',
    'quot',
    'rt',
]

# One shared lemmatizer instance, reused for every token in preprocess().
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise a Series of raw documents into space-joined, lemmatized tokens.

    Each document is lower-cased, stripped of non-ASCII characters, of URLs
    (anything starting with ``http``) and of ``@`` mentions, then split on
    runs of non-word characters. Remaining ``string.punctuation`` characters
    are removed from every token, tokens found in the module-level
    ``stopwords`` are dropped, and the survivors are lemmatized with the
    module-level ``lemmatizer``.

    Args:
        text: pandas Series of strings.

    Returns:
        pandas Series with the same index as ``text``. Each element is the
        surviving tokens joined by single spaces (no leading, trailing or
        doubled spaces); a document with no surviving tokens becomes ``''``.
    """
    # Set membership is O(1); scanning the stopword list per token is not.
    stopword_set = set(stopwords)
    # After the \W+ split only "_" can still be present, but the full
    # punctuation set is removed to keep the original contract.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _clean(document):
        """Run the whole cleaning pipeline on a single document string."""
        document = document.lower().encode('ascii', 'ignore').decode('ascii')
        document = re.sub(r'http\S+', '', document)
        document = re.sub(r'@\S*', '', document)

        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        words = (
            word.translate(strip_punctuation)
            for word in re.split(r'\W+', document)
        )
        # `if word` discards the empty strings that the split produces at the
        # edges of the text (and that "_"-only tokens collapse to); otherwise
        # they end up as "" tokens in the downstream corpus.
        kept = [word for word in words if word and word not in stopword_set]
        return ' '.join(lemmatizer.lemmatize(word) for word in kept)

    return text.apply(_clean)

In [7]:
# Clean the training split only; the held-out split is not touched in this cell.
X_train_processed = preprocess(X_train)

In [8]:
def build_corpus(data):
    """Turn preprocessed documents into per-document token lists.

    Args:
        data: pandas Series (or any other iterable) of strings whose tokens
            are separated by whitespace.

    Returns:
        List with one list of tokens per document, in iteration order.
        Empty or whitespace-only documents yield an empty list.
    """
    # Iterating the Series yields its values directly; Series.iteritems() was
    # removed in pandas 2.0. split() without an argument collapses runs of
    # whitespace, so no "" tokens are produced for the embedding vocabulary.
    return [sentence.split() for sentence in data]

# Token lists for the cleaned training documents; used below to train Word2Vec
# and as the input to vectorize().
corpus = build_corpus(X_train_processed)

In [9]:
from gensim.models import Word2Vec
# Train 100-dimensional embeddings on the training corpus; min_count=1 keeps
# every token, including those that occur only once.
# NOTE(review): `size` is the pre-4.0 gensim keyword; gensim >= 4.0 calls it
# `vector_size` — confirm the installed version before changing.
model = Word2Vec(corpus, size=100, min_count=1)

In [11]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    A document vector is the element-wise mean of the embeddings of those of
    its tokens that are present in ``model.wv``; tokens missing from the
    embedding are ignored.

    Args:
        list_of_docs: List of documents, each an iterable of string tokens.
        model: Gensim's Word Embedding. Must expose ``wv`` (supporting ``in``
            and lookup by token) and ``vector_size``.

    Returns:
        List of document vectors, one per document and in input order. A
        document with no known tokens gets a zero vector of length
        ``model.vector_size``.
    """
    features = []

    for tokens in list_of_docs:
        # The membership test guarantees the lookup succeeds, so no KeyError
        # handling is needed around it.
        known_vectors = [model.wv[token] for token in tokens if token in model.wv]
        if known_vectors:
            features.append(np.asarray(known_vectors).mean(axis=0))
        else:
            # Build the fallback only for documents that actually need it.
            features.append(np.zeros(model.vector_size))
    return features
    
# Embed every training document, then sanity-check the result as
# (number of documents, length of the first document vector).
vectorized_docs = vectorize(corpus, model=model)
len(vectorized_docs), len(vectorized_docs[0])

(6681, 100)

In [14]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

def mbkmeans_clusters(
    X,
    k,
    mb,
    print_silhouette_values,
    random_state=None,
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Prints the number of clusters, the overall silhouette coefficient and the
    inertia. When ``print_silhouette_values`` is true it also prints size,
    mean, min and max silhouette per cluster, sorted by mean (best first).
    Clusters that received no samples are left out of that listing.

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.
        random_state: Optional seed forwarded to MiniBatchKMeans. The default
            (None) keeps the previous unseeded behaviour; pass an int to get
            repeatable clusters.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(
        n_clusters=k, batch_size=mb, random_state=random_state
    ).fit(X)
    labels = km.labels_
    print(f"For n_clusters = {k}")

    if print_silhouette_values:
        # The overall coefficient is the mean of the per-sample values, so the
        # O(n^2) silhouette computation is done once and reused below.
        sample_silhouette_values = silhouette_samples(X, labels)
        overall_silhouette = sample_silhouette_values.mean()
    else:
        sample_silhouette_values = None
        overall_silhouette = silhouette_score(X, labels)

    print(f"Silhouette coefficient: {overall_silhouette:0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        print("Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            if cluster_silhouette_values.size == 0:
                # A centre can end up with no assigned samples; min()/max()
                # on an empty array would raise ValueError.
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        # Best-separated clusters (highest mean silhouette) first.
        silhouette_values.sort(key=lambda tup: tup[2], reverse=True)
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, labels

In [18]:
# Cluster the document vectors into 50 groups using mini-batches of 500;
# only the summary metrics are printed (per-cluster silhouettes disabled).
# No seed is passed, so the printed metrics can differ between runs.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs,
    k=50,
    mb=500,
    print_silhouette_values=False,
)
# NOTE(review): the disabled block below refers to `tokenized_docs`, which is
# not defined anywhere in the visible notebook — define it or drop the block.
# df_clusters = pd.DataFrame({
#     "text": corpus,
#     "tokens": [" ".join(text) for text in tokenized_docs],
#     "cluster": cluster_labels
# })

For n_clusters = 50
Silhouette coefficient: 0.26
Inertia:374.250441876943


In [19]:
# Same clustering with k=20; rebinds `clustering` / `cluster_labels`, so the
# results of the previous run are no longer reachable by name.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs,
    k=20,
    mb=500,
    print_silhouette_values=False,
)

For n_clusters = 20
Silhouette coefficient: 0.28
Inertia:752.3203047448192


In [20]:
# Same clustering with k=100; rebinds `clustering` / `cluster_labels`, so the
# results of the previous run are no longer reachable by name.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs,
    k=100,
    mb=500,
    print_silhouette_values=False,
)

For n_clusters = 100
Silhouette coefficient: 0.26
Inertia:213.4669914244418


In [21]:
# Same clustering with k=10; rebinds `clustering` / `cluster_labels`, so the
# results of the previous run are no longer reachable by name.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs,
    k=10,
    mb=500,
    print_silhouette_values=False,
)

For n_clusters = 10
Silhouette coefficient: 0.33
Inertia:1342.6420775580673


In [22]:
# Same clustering with k=3; as the last run in this section, this is the
# model/labels pair that `clustering` / `cluster_labels` keep afterwards.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs,
    k=3,
    mb=500,
    print_silhouette_values=False,
)

For n_clusters = 3
Silhouette coefficient: 0.48
Inertia:4613.803925937466
