# Experiments

In [1]:
import vectorizers
import kernels
from nltk.tree import Tree
import numpy as np
from sklearn import svm
from sklearn import cross_validation
from sklearn import feature_extraction

## loading data

In [2]:
# Read the raw text of the two input documents.
with open('../data/3gables.csv', 'r') as f:
    s1 = f.read()
with open('../data/100west.csv', 'r') as f:
    s2 = f.read()

# Integer class id -> human-readable text-type label.
int2cl = {
    0: 'descriptive',
    1: 'argumentative',
    2: 'narrative',
    3: 'explicative',
}

# Parse each document into an nltk Tree, then repeat the two trees to obtain
# a six-sample toy data set; y holds the matching class ids.
t1 = Tree.fromstring(s1)
t2 = Tree.fromstring(s2)
t_list = [t1, t2, t2, t1, t1, t2]
y = np.array([0, 1, 1, 0, 0, 1])

# One feature mapping per tree (presumably a dict, since it is fed to
# DictVectorizer below -- see vectorizers.build_norm_vect).
D = np.array([vectorizers.build_norm_vect(t) for t in t_list])

# Turn the feature mappings into a dense matrix X; Y is the round trip back
# from the matrix to mappings.
v = feature_extraction.DictVectorizer(sparse=False)
X = v.fit_transform(D)
Y = v.inverse_transform(X)

## loading classifiers

In [3]:
# Linear SVM trained on the vectorised features.
clf2 = svm.LinearSVC()
clf2.fit(X, y)

# Predict once on the training samples and reuse the result for both views.
pred = clf2.predict(X)
print(pred)
# print() called as a function with a single argument behaves the same on
# Python 2 and Python 3; the former `print [...]` statement is a SyntaxError
# on Python 3.
print([int2cl[x] for x in pred])

# 2-fold cross-validation; report mean accuracy +/- two standard deviations.
scores = cross_validation.cross_val_score(clf2, X, y, cv=2)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


#clf3 = svm.SVC(kernel='rbf')
#clf3.fit(X,y)
#clf3.predict(X)

[0 1 1 0 0 1]
['descriptive', 'argumentative', 'argumentative', 'descriptive', 'descriptive', 'argumentative']
Accuracy: 1.00 (+/- 0.00)


## using precomputed kernels

In [9]:
def compute_kernel(X,Y,kernel=kernels.rbf_kernel):
    """Build the Gram matrix between two collections of samples.

    X, Y   -- sized iterables of samples.
    kernel -- callable taking one sample from X and one from Y and returning
              a value that can be stored in a float array cell.

    Returns a numpy array K of shape (len(X), len(Y)) such that
    K[i, j] = kernel(X[i], Y[j]).
    """
    gram = np.zeros((len(X), len(Y)))
    for row, left in enumerate(X):
        for col, right in enumerate(Y):
            gram[row, col] = kernel(left, right)
    return gram

# Precompute the train-vs-train Gram matrix with the default kernel.
K = compute_kernel(D,D)
print(K)

# With kernel="precomputed" the SVM is fitted on the Gram matrix itself
# rather than on a feature matrix.
clf1 = svm.SVC(kernel="precomputed")
clf1.fit(K,y)

print(clf1.predict(K))
# Cross-validate the precomputed-kernel SVM on its Gram matrix. This line
# previously re-scored clf2 on X, duplicating the earlier LinearSVC result
# instead of evaluating clf1.
scores = cross_validation.cross_val_score(clf1,K,y,cv=2)


[[ 1.          0.97938667  0.97938667  1.          1.          0.97938667]
 [ 0.97938667  1.          1.          0.97938667  0.97938667  1.        ]
 [ 0.97938667  1.          1.          0.97938667  0.97938667  1.        ]
 [ 1.          0.97938667  0.97938667  1.          1.          0.97938667]
 [ 1.          0.97938667  0.97938667  1.          1.          0.97938667]
 [ 0.97938667  1.          1.          0.97938667  0.97938667  1.        ]]
[0 1 1 0 0 1]


### Tree kernel

In [None]:
# An SVC with kernel='precomputed' is fitted on the square Gram matrix of all
# training samples, and its row count must match the six labels in y.
# Comparing t_list[:2] with t_list[2:] gave a 2x4 matrix that cannot be fitted
# against y, so every tree is compared with every tree instead.
K2 = compute_kernel(t_list,t_list,kernels.tree_kernel)

In [None]:
# Show the tree-kernel Gram matrix, then fit an SVM directly on it.
# NOTE(review): fitting with kernel='precomputed' expects K2 to have one row
# and one column per entry of y -- confirm K2's shape before running.
print(K2)
clf4  = svm.SVC(kernel='precomputed')
clf4.fit(K2,y)

### KNN

In [16]:
# k-nearest-neighbours classifier created with scikit-learn's default settings.
from sklearn import neighbors
clf5 = neighbors.KNeighborsClassifier()
# Fitting is currently disabled:
#clf5.fit(X,y)

### MaxEnt

In [17]:
# Maximum-entropy model, i.e. logistic regression, with default settings.
from sklearn import linear_model
clf6 = linear_model.LogisticRegression()
# Fitting is currently disabled:
#clf6.fit(X,y)

### Random Forest

In [15]:
# Random-forest classifier created with scikit-learn's default settings.
from sklearn import ensemble
clf7 = ensemble.RandomForestClassifier()
# Fitting is currently disabled:
#clf7.fit(X,y)

## Wikipedia extraction

In [180]:
# Search Wikipedia for the decade overview article and keep the text content
# of the first search result in `s`.
import wikipedia as wk
p = wk.search('2010s in film')
# Assumes the search returned at least one hit; p[0] fails otherwise.
page = wk.page(p[0])
s = page.content

In [201]:
import re
# Capture everything between the '=== 0-9 ===' sub-heading and the
# '== See also ==' heading. DOTALL lets '.' match newlines so the group can
# span the whole multi-line listing.
reg = re.compile('\n\n\n=== 0-9 ===\n(.*)\n\n\n== See also ==\n',re.DOTALL)
listing = reg.search(s)
if listing is None:
    # Fail with an explicit message instead of an opaque IndexError when the
    # page content no longer contains the expected headings.
    raise ValueError("film listing not found: expected '=== 0-9 ===' ... "
                     "'== See also ==' in the page content")
# One candidate entry per line; headers and blank lines are still included here.
films = listing.group(1).split('\n')

In [202]:
# Keep only real entries: drop empty lines and lines containing a '===' header marker.
films = [f for f in films if len(f) > 0 and '===' not in f]

In [234]:
# Matches from the first '(' to the last ')' on a line (greedy).
# Written as a raw string: '\(' in a plain literal is an invalid escape
# sequence, which newer Python versions warn about.
reg2 = re.compile(r'\(.*\)')
titles = []
for f in films:
    m = reg2.search(f)
    if m is None:
        # Entries without a parenthesised part are skipped, as before.
        continue
    # Remove the parenthesised part and strip the whitespace it leaves behind,
    # so that titles do not end in a stray space.
    titles.append(f.replace(m.group(0), '').strip())

In [236]:
# Fetch the article for the first extracted title and keep its text in `cont`.
# Assumes `titles` is non-empty and the search returns at least one hit.
p = wk.search(titles[0])
page = wk.page(p[0])
cont = page.content

In [238]:
# Non-greedy capture of the text between the '== Plot ==' heading and the
# next heading; DOTALL lets '.' match newlines so the plot may span several
# paragraphs.
reg3 = re.compile('\n\n\n== Plot ==\n(.*?)\n\n\n==',re.DOTALL)
plots = []
skipped_titles = []  # titles for which no plot could be retrieved
for t in titles:
    # Fetch the article of *this* title. The loop previously matched against
    # `cont`, the page of titles[0], so every title received the same plot.
    try:
        hits = wk.search(t)
        content = wk.page(hits[0]).content if hits else None
    except wk.exceptions.WikipediaException:
        # Ambiguous or missing pages must not abort the whole crawl.
        content = None
    found = reg3.search(content) if content is not None else None
    if found is None:
        # No search hit, no usable page, or no Plot section in the article.
        skipped_titles.append(t)
        continue
    plots.append((t, found.group(1)))