In [1]:
import numpy as np
from xml.etree import ElementTree
import os

In [2]:
textPath='./Text'
nontextPath='./Nontext'

In [3]:
def Width(a):
    return int(a['Right'])-int(a['Left'])
def Height(a):
    return int(a['Bottom'])-int(a['Top'])
def Area(a):
    return Width(a)*Height(a)
def w2f(x):
    feature=[]
    a=x.attrib
    sizes=x[0].attrib
    
    area=Area(sizes)
    width=Width(sizes)
    height=Height(sizes)
    blackCount=int(a['BlackCount'])
    horzStrokeCount=int(a['HorzStrokesCount'])
    vertStrokeCount=int(a['VertStrokesCount'])
    maxHorzStrokeLength=int(a['MaxHorzStrokeLength'])
    whiteHolesCount=int(a['WhiteHolesCount'])

    feature.append((area-blackCount) / ((horzStrokeCount + height)*height) )
    feature.append((horzStrokeCount+vertStrokeCount)/max(width, height))
    feature.append(blackCount/(horzStrokeCount+vertStrokeCount))

    feature.append(maxHorzStrokeLength/horzStrokeCount)
    feature.append(blackCount/area)
    feature.append(whiteHolesCount)
    feature.append(100*whiteHolesCount/horzStrokeCount)
    feature.append(100*whiteHolesCount/vertStrokeCount)
    feature.append(100*vertStrokeCount/width)
    feature.append(100*horzStrokeCount/height)
        
    return feature
    
def getFeatures(f):
    xml=ElementTree.parse(f)
    features=[]
    for x in xml.iter("WordFragment"):
        textFeature=w2f(x)
        if len(textFeature) > 0:
            features.append(textFeature)
    return features
        
def getTrainFromXML(path):
    train = []
    
    for f in os.listdir(path):
        if f.endswith('.xml'):
            a=getFeatures(path+'/'+f)
            
            train += a
    return train

In [4]:
X_text=getTrainFromXML(textPath)
Y_text=[1 for x in X_text]
X_nontext=getTrainFromXML(nontextPath)
Y_nontext=[0 for x in X_nontext]

In [5]:
X=X_text+X_nontext
y=Y_text+Y_nontext

In [6]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [7]:
rf=RandomForestClassifier()
gb=GradientBoostingClassifier()
svc=LinearSVC()
lg=LogisticRegression()

In [8]:
print 'Random Forest f1={}'.format(np.mean(cross_val_score(rf, X, y, cv=10, scoring='f1')))
print 'GradientBoosting f1={}'.format(np.mean(cross_val_score(gb, X, y, cv=10, scoring='f1')))
print 'LinearSVC f1={}'.format(np.mean(cross_val_score(svc, X, y, cv=10, scoring='f1')))
print 'LogisticRegression f1={}'.format(np.mean(cross_val_score(lg, X, y, cv=10, scoring='f1')))

Random Forest f1=0.847734223648
GradientBoosting f1=0.844894400101
LinearSVC f1=0.626072951996
LogisticRegression f1=0.84151447973


In [9]:
from sklearn.model_selection import GridSearchCV
parRF = [{'n_estimators': [20, 30, 40, 50],
              'max_features': [None, 'sqrt', 'log2'],
              'criterion': ['gini', 'entropy']}]

clf = GridSearchCV(RandomForestClassifier(), parRF, cv=10, scoring='f1')
clf.fit(X, y)
print(clf.best_params_)
print(max(clf.cv_results_['mean_test_score']))

{'max_features': 'sqrt', 'n_estimators': 40, 'criterion': 'entropy'}
0.853174781932


In [10]:
parGB = [{'n_estimators': [100, 250],
              'max_features': [None, 'sqrt', 'log2'],
              'loss': ['deviance', 'exponential']}]

clf = GridSearchCV(GradientBoostingClassifier(), parGB, cv=10, scoring='f1')
clf.fit(X, y)
print(clf.best_params_)
print(max(clf.cv_results_['mean_test_score']))

{'max_features': 'sqrt', 'loss': 'exponential', 'n_estimators': 100}
0.849316226989


In [11]:
parLG = [{'penalty': ['l2', 'l1'],
              'C': [0.1, 0.5, 1.0, 10.0, 100.0]}]

clf = GridSearchCV(LogisticRegression(), parLG, cv=10, scoring='f1')
clf.fit(X, y)
print(clf.best_params_)
print(max(clf.cv_results_['mean_test_score']))

{'penalty': 'l2', 'C': 0.1}
0.842295972135


In [12]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)



In [13]:
rf=RandomForestClassifier(max_features='sqrt',n_estimators=40,criterion='entropy')
rf.fit(x_train, y_train)
y_rf=rf.predict_proba(x_test)

In [14]:
gb=GradientBoostingClassifier(max_features='sqrt',n_estimators=100,loss='exponential')
gb.fit(x_train, y_train)
y_gb=gb.predict_proba(x_test)

In [16]:
lg=LogisticRegression(penalty='l2',C=0.1)
lg.fit(x_train, y_train)
y_lg=lg.predict_proba(x_test)

In [32]:
y_all=[int((y_lg[i][1]+y_gb[i][1]+y_rf[i][1])>1.5) for i in xrange(len(y_test))]

In [33]:
from sklearn.metrics import f1_score

In [34]:
f1_score(y_test, y_all)

0.88252090556801965