Skip to content

Commit

Permalink
Initial commit.
Browse files Browse the repository at this point in the history
  • Loading branch information
cvangysel committed Jul 20, 2014
0 parents commit f9f5ac9
Show file tree
Hide file tree
Showing 18 changed files with 1,503 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
.DS_Store
.classpath
.project
.settings/
target/
14 changes: 14 additions & 0 deletions README.md
@@ -0,0 +1,14 @@
Listening to the Flock
======================

This repository contains a selection of the code developed in the context of my master's thesis. The goal of the thesis was the application of graph classification in order to determine the political climate of the social circles of Twitter users.

To accomplish this we crawl Twitter, a microblogging service, and retrieve connections and interactions between users. We then induce a weighted, directed graph structure from this data. Markov random walks are used in order to obtain a probability distribution over political parties, effectively modelling the distance between users and political parties. We implement the [Adsorption](http://www.esprockets.com/papers/adsorption-yt.pdf) algorithm by Baluja et al. (2008) in the MapReduce paradigm using Apache Crunch. Further we investigate the importance of certain features on the microblogging platform with respect to our problem domain. We find that retweets on Twitter are a valuable indicator of like-mindedness. Reciprocal connections, however, do not seem to exhibit this property.

This repository contains the implementation of the Adsorption algorithm in the [Apache Crunch](https://crunch.apache.org/) framework. The Apache Crunch Java library provides a framework for writing, testing, and running MapReduce pipelines. Its goal is to make pipelines that are composed of many user-defined functions simple to write, easy to test, and efficient to run.

Other parts of the thesis code that are not included in this repository include tools for crawling social networks and for inducing the graph structure used as algorithm input. These, in addition to the full thesis text, are available upon request.

There are also some tests included in the repository which act as usage examples. These can be found in `src/test/java` and can be run by executing `mvn test` (requires Maven 3.0+).

-- Christophe
78 changes: 78 additions & 0 deletions pom.xml
@@ -0,0 +1,78 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the be.stophr:ml artifact, packaged as a jar. -->
<modelVersion>4.0.0</modelVersion>
<groupId>be.stophr</groupId>
<artifactId>ml</artifactId>

<version>DEV</version>

<packaging>jar</packaging>

<name>ml</name>
<url>http://chri.stophr.be</url>

<dependencies>
<!-- Runtime dependencies: Hadoop, Apache Crunch and Commons CLI. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>1.2.1</version>
</dependency>
<dependency>
<groupId>org.apache.crunch</groupId>
<artifactId>crunch-core</artifactId>
<version>0.10.0</version>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.2</version>
</dependency>
<!-- JUnit is the only dependency limited to the test classpath. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<!-- NOTE(review): hamcrest and mockito carry no <scope>, so they default to
     compile scope. Presumably they are only needed by the tests; confirm and
     consider adding <scope>test</scope>. -->
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-all</artifactId>
<version>1.3</version>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
<version>1.8.4</version>
</dependency>
</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.2.1</version>
<!-- NOTE(review): source/target look like compiler settings rather than
     exec-maven-plugin parameters. TODO confirm they have any effect here. -->
<configuration>
<source>1.6</source>
<target>1.6</target>
</configuration>
</plugin>
<plugin>
<!-- NOTE(review): no <groupId> is given, so Maven falls back to the default
     org.apache.maven.plugins. That makes this entry a second declaration of
     the maven-compiler-plugin configured below (3.1, source/target 1.6).
     The 1.3/1.2 levels would also reject the for-each loops and annotations
     used under src/main/java. Presumably a leftover; confirm and remove. -->
<artifactId>maven-compiler-plugin</artifactId>
<version>3.0</version>
<configuration>
<source>1.3</source>
<target>1.2</target>
</configuration>
</plugin>
<plugin>
<!-- Compiles the sources at Java 1.6 language and bytecode level. -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.6</source>
<target>1.6</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
52 changes: 52 additions & 0 deletions src/main/java/be/stophr/ml/InformationTheory.java
@@ -0,0 +1,52 @@
package be.stophr.ml;

/**
* Information theory-based entropy measures.
*
* @author Christophe Van Gysel
*/
public class InformationTheory {

    /** Number of decimal digit symbols ('0'-'9'); letters are stored after them. */
    private static final int DIGIT_COUNT = 10;

    /** Size of the tracked alphabet: 10 digits followed by 26 lowercase letters. */
    private static final int SYMBOL_COUNT = DIGIT_COUNT + 26;

    /** Natural logarithm of 2, used to convert natural logarithms to base 2. */
    private static final double LOG_2 = Math.log(2);

    /**
     * Computes the Shannon entropy (in bits) of the distribution of ASCII
     * letters and digits in {@code input}.
     *
     * Letters are counted case-insensitively; every character other than
     * {@code 0-9}, {@code a-z} and {@code A-Z} is ignored.
     *
     * @param input the string to analyse; must not be {@code null}
     * @return the entropy in bits, or {@code 0.0} when {@code input} contains
     *         no letters or digits
     */
    public static double entropy(String input) {
        final long[] statistics = new long[SYMBOL_COUNT];

        // Locale.ROOT keeps the case mapping independent of the JVM default
        // locale (e.g. under a Turkish locale 'I' would otherwise become the
        // dotless 'ı' and fall outside the a-z range).
        for (final char symbol : input.toLowerCase(java.util.Locale.ROOT).toCharArray()) {
            if (symbol >= '0' && symbol <= '9') {
                statistics[symbol - '0']++;
            } else if (symbol >= 'a' && symbol <= 'z') {
                statistics[symbol - 'a' + DIGIT_COUNT]++;
            }
        }

        return entropy(statistics);
    }

    /**
     * Computes the Shannon entropy (in bits) of the distribution described by
     * a histogram of occurrence counts.
     *
     * @param input non-negative occurrence counts, one entry per symbol
     * @return the entropy in bits, or {@code 0.0} when all counts are zero or
     *         the array is empty
     */
    public static double entropy(long[] input) {
        double sum = 0;
        for (final long count : input) {
            assert count >= 0;

            sum += count;
        }

        // Without observations there is no distribution to normalise; dividing
        // by zero below would only produce NaN probabilities.
        if (sum == 0) {
            return 0.0;
        }

        final double[] probabilities = new double[input.length];
        for (int i = 0; i < probabilities.length; i++) {
            probabilities[i] = input[i] / sum;
        }

        return entropy(probabilities);
    }

    /**
     * Computes the Shannon entropy (in bits) of a probability distribution.
     *
     * The values are used as given; they are not normalised here. Zero
     * probabilities contribute nothing to the result.
     *
     * @param probabilities non-negative probabilities, one entry per symbol
     * @return the entropy in bits
     */
    public static double entropy(double[] probabilities) {
        double entropy = 0.0;

        for (final double probability : probabilities) {
            assert probability >= 0.0;

            // Skip zero entries: log(0) is -Infinity and 0 * -Infinity is NaN.
            if (probability > 0.0) {
                entropy -= probability * (Math.log(probability) / LOG_2);
            }
        }

        return entropy;
    }

}
77 changes: 77 additions & 0 deletions src/main/java/be/stophr/ml/classification/Alphabet.java
@@ -0,0 +1,77 @@
package be.stophr.ml.classification;

import java.io.Serializable;

import com.google.common.collect.HashBiMap;

/**
* An alphabet provides a mapping between strings and tokens.
*
* It is commonly used as a mapping between internal label identifiers
* and human-readable representations.
*
* @author Christophe Van Gysel
*/
@SuppressWarnings("serial")
public class Alphabet implements Serializable {

    /** Bidirectional mapping between numeric indexes and their tokens. */
    protected final HashBiMap<Long, String> map = HashBiMap.create();

    /** Index that will be handed out to the next unseen token. */
    protected long nextIndex = 0;

    /** Once {@code true}, unseen tokens are no longer expected to be added. */
    protected boolean locked = false;

    /**
     * Creates an empty, unlocked alphabet.
     */
    public Alphabet() {
    }

    /**
     * Creates an alphabet holding the given tokens, indexed in argument order
     * starting at 0, and locks it afterwards.
     *
     * @param tokens the distinct tokens that make up the alphabet
     */
    public Alphabet(String... tokens) {
        for (final String token : tokens) {
            this.add(token);
        }

        this.lock();
    }

    /**
     * Registers a token that is not yet part of the alphabet.
     *
     * The preconditions (alphabet not locked, token not yet present) are
     * checked with {@code assert} only, so they are enforced solely when the
     * JVM runs with assertions enabled.
     *
     * @param token the new token
     * @return the index assigned to {@code token}
     */
    public long add(final String token) {
        assert !this.locked;
        assert !this.map.containsValue(token);

        return this.get(token);
    }

    /**
     * Returns the index of a token, assigning the next free index when the
     * token has not been seen before.
     *
     * Assigning a new index on a locked alphabet trips an {@code assert}.
     *
     * @param token the token to look up
     * @return the index associated with {@code token}
     */
    public long get(final String token) {
        // A single inverse lookup replaces the containsValue + get pair.
        final Long known = this.map.inverse().get(token);
        if (known != null) {
            return known;
        }

        assert !this.locked;

        final long index = this.nextIndex;

        this.map.put(index, token);
        this.nextIndex = Math.max(this.nextIndex + 1, this.map.size());

        return index;
    }

    /**
     * Locks the alphabet, marking it as complete.
     */
    public void lock() {
        this.locked = true;
    }

    /**
     * Locks the alphabet.
     *
     * This method shares its signature with {@link Object#finalize()} and
     * therefore overrides it, which also makes the garbage collector invoke
     * it on unreachable instances. It is kept only so existing callers keep
     * working.
     *
     * @deprecated use {@link #lock()} instead
     */
    @Deprecated
    @Override
    public void finalize() {
        this.lock();
    }

    /**
     * Returns the token stored under an index.
     *
     * @param index the index to look up
     * @return the token, or {@code null} when the index is unknown
     */
    public String getToken(final long index) {
        return this.map.get(index);
    }

    /**
     * @return the number of tokens in the alphabet
     */
    public int size() {
        return this.map.size();
    }

    /**
     * @return {@code true} once the alphabet has been locked
     */
    public boolean isLocked() {
        return this.locked;
    }

    /**
     * @return the string form of the underlying index-to-token map
     */
    @Override
    public String toString() {
        return this.map.toString();
    }

}

0 comments on commit f9f5ac9

Please sign in to comment.